for-6.8/block-2024-01-08
-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmWcIOIQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpn6hD/9oO7U75PuxUwYYHZ9Uzxpw6gQ0LEmeyJmE NQYCkfYHVq3IsgOdF7elI9v3qtr6v8V8CdB7cByrnn3DgwsMuiTKZZ0dK7vH37PO DX+/xn349e8oH7RdRo7f3m95g1YbHfpfnj0Rc4mjTDV72Jr/HlLTVgGTQg8DEnCR wBIFmeuBHHgeeLh87gsWLAP7ReReiy9V1uqpDFsko2/4BxRAM/8eedkwcAxD8aEy rd+dT/SBQj2cOdQMUeExT3gWjwzHh6ZHx3f1WCLK5fdck6BogH2hBUeri6F/H98L HoaXjBZYBTH68hB/mnO5I4g1ZlrVM74Vp7JPa3e1SFFtyEi6lsyrk2J3GoNh0E7r pXqH5kAcaJwBsBrbRGuvEyGbn9RLTaN5Gvseud0VE4oMruyodTniQaHXuIGackgz sMavMho4486EUWPaF7gIBdLNK1hO13w+IDZ4+3oBxhudMqdgZbk4iYpOCqQ7QY5G 2vkzAE/sZ+aVNXeaIQOI8dE5clBy8gJ+6+t8dm3DY1r1xdbcnU40iZ8/fri3h69r vHs9bpQnVWZF0gEyEflY1pkcAPpIkvMmWCR7Ehy5YCkIfa+qfSL05o3dicpWovLP N+gCtpkhTK2AvmUWsUMypMLRvoSOImyCIiobrr3qNBaUdgRP8xKfUa72RuRp8cGl Vrj5oAiE3w== =YAfp -----END PGP SIGNATURE----- Merge tag 'for-6.8/block-2024-01-08' of git://git.kernel.dk/linux Pull block updates from Jens Axboe: "Pretty quiet round this time around. This contains: - NVMe updates via Keith: - nvme fabrics spec updates (Guixin, Max) - nvme target udpates (Guixin, Evan) - nvme attribute refactoring (Daniel) - nvme-fc numa fix (Keith) - MD updates via Song: - Fix/Cleanup RCU usage from conf->disks[i].rdev (Yu Kuai) - Fix raid5 hang issue (Junxiao Bi) - Add Yu Kuai as Reviewer of the md subsystem - Remove deprecated flavors (Song Liu) - raid1 read error check support (Li Nan) - Better handle events off-by-1 case (Alex Lyakas) - Efficiency improvements for passthrough (Kundan) - Support for mapping integrity data directly (Keith) - Zoned write fix (Damien) - rnbd fixes (Kees, Santosh, Supriti) - Default to a sane discard size granularity (Christoph) - Make the default max transfer size naming less confusing (Christoph) - Remove support for deprecated host aware zoned model (Christoph) - Misc fixes (me, Li, Matthew, Min, Ming, Randy, liyouhong, Daniel, Bart, Christoph)" * tag 'for-6.8/block-2024-01-08' of git://git.kernel.dk/linux: (78 
commits) block: Treat sequential write preferred zone type as invalid block: remove disk_clear_zoned sd: remove the !ZBC && blk_queue_is_zoned case in sd_read_block_characteristics drivers/block/xen-blkback/common.h: Fix spelling typo in comment blk-cgroup: fix rcu lockdep warning in blkg_lookup() blk-cgroup: don't use removal safe list iterators block: floor the discard granularity to the physical block size mtd_blkdevs: use the default discard granularity bcache: use the default discard granularity zram: use the default discard granularity null_blk: use the default discard granularity nbd: use the default discard granularity ubd: use the default discard granularity block: default the discard granularity to sector size bcache: discard_granularity should not be smaller than a sector block: remove two comments in bio_split_discard block: rename and document BLK_DEF_MAX_SECTORS loop: don't abuse BLK_DEF_MAX_SECTORS aoe: don't abuse BLK_DEF_MAX_SECTORS null_blk: don't cap max_hw_sectors to BLK_DEF_MAX_SECTORS ...
This commit is contained in:
commit
01d550f0fc
@ -20079,6 +20079,7 @@ F: include/linux/property.h
|
||||
|
||||
SOFTWARE RAID (Multiple Disks) SUPPORT
|
||||
M: Song Liu <song@kernel.org>
|
||||
R: Yu Kuai <yukuai3@huawei.com>
|
||||
L: linux-raid@vger.kernel.org
|
||||
S: Supported
|
||||
Q: https://patchwork.kernel.org/project/linux-raid/list/
|
||||
|
@ -798,7 +798,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
|
||||
ubd_dev->cow.fd = err;
|
||||
}
|
||||
if (ubd_dev->no_trim == 0) {
|
||||
ubd_dev->queue->limits.discard_granularity = SECTOR_SIZE;
|
||||
blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
|
||||
blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
|
||||
}
|
||||
|
@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
|
||||
|
||||
memset(bip, 0, sizeof(*bip));
|
||||
|
||||
/* always report as many vecs as asked explicitly, not inline vecs */
|
||||
bip->bip_max_vcnt = nr_vecs;
|
||||
if (nr_vecs > inline_vecs) {
|
||||
bip->bip_max_vcnt = nr_vecs;
|
||||
bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
|
||||
&bip->bip_max_vcnt, gfp_mask);
|
||||
if (!bip->bip_vec)
|
||||
goto err;
|
||||
} else {
|
||||
bip->bip_vec = bip->bip_inline_vecs;
|
||||
bip->bip_max_vcnt = inline_vecs;
|
||||
}
|
||||
|
||||
bip->bip_bio = bio;
|
||||
@ -91,6 +91,47 @@ err:
|
||||
}
|
||||
EXPORT_SYMBOL(bio_integrity_alloc);
|
||||
|
||||
static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
|
||||
bool dirty)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nr_vecs; i++) {
|
||||
if (dirty && !PageCompound(bv[i].bv_page))
|
||||
set_page_dirty_lock(bv[i].bv_page);
|
||||
unpin_user_page(bv[i].bv_page);
|
||||
}
|
||||
}
|
||||
|
||||
static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
|
||||
{
|
||||
unsigned short nr_vecs = bip->bip_max_vcnt - 1;
|
||||
struct bio_vec *copy = &bip->bip_vec[1];
|
||||
size_t bytes = bip->bip_iter.bi_size;
|
||||
struct iov_iter iter;
|
||||
int ret;
|
||||
|
||||
iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
|
||||
ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
|
||||
WARN_ON_ONCE(ret != bytes);
|
||||
|
||||
bio_integrity_unpin_bvec(copy, nr_vecs, true);
|
||||
}
|
||||
|
||||
static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
|
||||
{
|
||||
bool dirty = bio_data_dir(bip->bip_bio) == READ;
|
||||
|
||||
if (bip->bip_flags & BIP_COPY_USER) {
|
||||
if (dirty)
|
||||
bio_integrity_uncopy_user(bip);
|
||||
kfree(bvec_virt(bip->bip_vec));
|
||||
return;
|
||||
}
|
||||
|
||||
bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
|
||||
}
|
||||
|
||||
/**
|
||||
* bio_integrity_free - Free bio integrity payload
|
||||
* @bio: bio containing bip to be freed
|
||||
@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio)
|
||||
|
||||
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
|
||||
kfree(bvec_virt(bip->bip_vec));
|
||||
else if (bip->bip_flags & BIP_INTEGRITY_USER)
|
||||
bio_integrity_unmap_user(bip);
|
||||
|
||||
__bio_integrity_free(bs, bip);
|
||||
bio->bi_integrity = NULL;
|
||||
@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
|
||||
}
|
||||
EXPORT_SYMBOL(bio_integrity_add_page);
|
||||
|
||||
/*
 * Attach integrity data to @bio through a kernel bounce buffer.
 *
 * @bio:       bio that receives the integrity payload
 * @bvec:      pinned segments describing the original buffer
 * @nr_vecs:   number of entries in @bvec
 * @len:       number of integrity bytes
 * @direction: ITER_SOURCE when data flows from @bvec to the device (write),
 *             anything else is handled as a read
 * @seed:      value stored in the payload's bip_iter.bi_sector
 *
 * For writes the data is copied into the bounce buffer up front, so the
 * pins on @bvec are dropped once the payload is set up. For reads the bounce
 * buffer is zeroed and @bvec is saved in bip_vec[1..] for completion handling.
 *
 * On failure the pins on @bvec are left untouched: the caller releases them
 * in its own error path, so dropping them here as well would unpin the same
 * pages twice.
 *
 * Returns 0 on success, -ENOMEM if the bounce buffer cannot be allocated or
 * added to the payload, -EFAULT if the write data cannot be copied, or the
 * error returned by bio_integrity_alloc().
 */
static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
				   int nr_vecs, unsigned int len,
				   unsigned int direction, u32 seed)
{
	bool write = direction == ITER_SOURCE;
	struct bio_integrity_payload *bip;
	struct iov_iter iter;
	void *buf;
	int ret;

	buf = kmalloc(len, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	if (write) {
		iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
		if (!copy_from_iter_full(buf, len, &iter)) {
			ret = -EFAULT;
			goto free_buf;
		}

		bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
	} else {
		memset(buf, 0, len);

		/*
		 * We need to preserve the original bvec and the number of vecs
		 * in it for completion handling
		 */
		bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
	}

	if (IS_ERR(bip)) {
		ret = PTR_ERR(bip);
		goto free_buf;
	}

	if (!write)
		memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));

	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
				     offset_in_page(buf));
	if (ret != len) {
		ret = -ENOMEM;
		goto free_bip;
	}

	/*
	 * Nothing can fail past this point. The write data already lives in
	 * the bounce buffer, so the original pages are no longer needed.
	 */
	if (write)
		bio_integrity_unpin_bvec(bvec, nr_vecs, false);

	bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
	bip->bip_iter.bi_sector = seed;
	return 0;
free_bip:
	bio_integrity_free(bio);
free_buf:
	kfree(buf);
	return ret;
}
|
||||
|
||||
/*
 * Attach the pinned segments in @bvec directly to @bio as its integrity
 * payload, without a bounce buffer.
 *
 * @bio:     bio that receives the integrity payload
 * @bvec:    pinned segments describing the integrity buffer
 * @nr_vecs: number of entries in @bvec
 * @len:     number of integrity bytes
 * @seed:    value stored in the payload's bip_iter.bi_sector
 *
 * Returns 0 on success or the error returned by bio_integrity_alloc().
 */
static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
				   int nr_vecs, unsigned int len, u32 seed)
{
	struct bio_integrity_payload *bip;

	bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
	if (IS_ERR(bip))
		return PTR_ERR(bip);

	memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
	/*
	 * The vectors were filled in by hand rather than through
	 * bio_integrity_add_page(), so record how many are populated;
	 * otherwise the payload reports zero vectors to code that consults
	 * bip_vcnt.
	 */
	bip->bip_vcnt = nr_vecs;
	bip->bip_flags |= BIP_INTEGRITY_USER;
	bip->bip_iter.bi_sector = seed;
	bip->bip_iter.bi_size = len;
	return 0;
}
|
||||
|
||||
/*
 * Build bio_vecs in @bvec from the @nr_vecs pages in @pages, which together
 * cover @bytes bytes starting @offset bytes into the first page.
 *
 * Consecutive pages that belong to the same folio and are adjacent in the
 * page array sense (pages[j] == pages[j - 1] + 1) are folded into a single
 * bio_vec; the pin on each folded-in page is dropped, so only the first page
 * of every resulting bio_vec stays pinned.
 *
 * Returns the number of bio_vecs written to @bvec.
 */
static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
				    int nr_vecs, ssize_t bytes, ssize_t offset)
{
	unsigned int count = 0;
	int head = 0;

	while (head < nr_vecs) {
		struct folio *folio = page_folio(pages[head]);
		size_t seg_len = min_t(size_t, bytes, PAGE_SIZE - offset);
		int tail = head + 1;

		bytes -= seg_len;

		/* Extend the segment over directly following pages. */
		while (tail < nr_vecs &&
		       page_folio(pages[tail]) == folio &&
		       pages[tail] == pages[tail - 1] + 1) {
			size_t chunk = min_t(size_t, PAGE_SIZE, bytes);

			unpin_user_page(pages[tail]);
			seg_len += chunk;
			bytes -= chunk;
			tail++;
		}

		bvec_set_page(&bvec[count], pages[head], seg_len, offset);
		count++;

		/* Only the very first segment starts inside its page. */
		offset = 0;
		head = tail;
	}

	return count;
}
|
||||
|
||||
/**
 * bio_integrity_map_user - attach a user buffer as integrity payload of a bio
 * @bio:	bio to attach the payload to; must not already have one
 * @ubuf:	user buffer holding (write) or receiving (read) the integrity data
 * @bytes:	length of @ubuf in bytes
 * @seed:	value stored in the payload's bip_iter.bi_sector
 *
 * Description: Pins the pages backing @ubuf and attaches them to @bio. The
 * pages are used directly unless the buffer does not satisfy the queue's DMA
 * alignment/padding mask or would need more segments than
 * queue_max_integrity_segments() allows; in those cases the data goes through
 * a kernel bounce buffer instead.
 *
 * Return: 0 on success, -EINVAL if @bio already has an integrity payload,
 * -E2BIG if the buffer exceeds the queue's max_hw_sectors or spans more than
 * BIO_MAX_VECS pages, -ENOMEM on allocation failure, or the error returned by
 * page extraction or payload setup.
 */
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
			   u32 seed)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
	struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
	struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
	unsigned int direction, nr_bvecs;
	struct iov_iter iter;
	int ret, nr_vecs;
	size_t offset;
	bool copy;

	if (bio_integrity(bio))
		return -EINVAL;
	if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
		return -E2BIG;

	if (bio_data_dir(bio) == READ)
		direction = ITER_DEST;
	else
		direction = ITER_SOURCE;

	iov_iter_ubuf(&iter, direction, ubuf, bytes);
	/* Ask for one page more than allowed so that "too many" is detectable. */
	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
	if (nr_vecs > BIO_MAX_VECS)
		return -E2BIG;
	if (nr_vecs > UIO_FASTIOV) {
		/* kcalloc() takes the element count first, then the size. */
		bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
		if (!bvec)
			return -ENOMEM;
		/* Let iov_iter_extract_pages() allocate the page array. */
		pages = NULL;
	}

	copy = !iov_iter_is_aligned(&iter, align, align);
	ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
	if (unlikely(ret < 0))
		goto free_bvec;

	nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
	if (pages != stack_pages)
		kvfree(pages);
	if (nr_bvecs > queue_max_integrity_segments(q))
		copy = true;

	if (copy)
		ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
					      direction, seed);
	else
		ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
	if (ret)
		goto release_pages;
	/* Both helpers copy what they need out of @bvec. */
	if (bvec != stack_vec)
		kfree(bvec);

	return 0;

release_pages:
	bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
free_bvec:
	if (bvec != stack_vec)
		kfree(bvec);
	return ret;
}
|
||||
EXPORT_SYMBOL_GPL(bio_integrity_map_user);
|
||||
|
||||
/**
|
||||
* bio_integrity_process - Process integrity metadata for a bio
|
||||
* @bio: bio to generate/verify integrity metadata for
|
||||
|
53
block/bio.c
53
block/bio.c
@ -944,7 +944,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
|
||||
|
||||
if ((addr1 | mask) != (addr2 | mask))
|
||||
return false;
|
||||
if (bv->bv_len + len > queue_max_segment_size(q))
|
||||
if (len > queue_max_segment_size(q) - bv->bv_len)
|
||||
return false;
|
||||
return bvec_try_merge_page(bv, page, len, offset, same_page);
|
||||
}
|
||||
@ -966,10 +966,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset,
|
||||
unsigned int max_sectors, bool *same_page)
|
||||
{
|
||||
unsigned int max_size = max_sectors << SECTOR_SHIFT;
|
||||
|
||||
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
|
||||
return 0;
|
||||
|
||||
if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors)
|
||||
len = min3(len, max_size, queue_max_segment_size(q));
|
||||
if (len > max_size - bio->bi_iter.bi_size)
|
||||
return 0;
|
||||
|
||||
if (bio->bi_vcnt > 0) {
|
||||
@ -1145,13 +1148,22 @@ EXPORT_SYMBOL(bio_add_folio);
|
||||
|
||||
void __bio_release_pages(struct bio *bio, bool mark_dirty)
|
||||
{
|
||||
struct bvec_iter_all iter_all;
|
||||
struct bio_vec *bvec;
|
||||
struct folio_iter fi;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
if (mark_dirty && !PageCompound(bvec->bv_page))
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
bio_release_page(bio, bvec->bv_page);
|
||||
bio_for_each_folio_all(fi, bio) {
|
||||
struct page *page;
|
||||
size_t done = 0;
|
||||
|
||||
if (mark_dirty) {
|
||||
folio_lock(fi.folio);
|
||||
folio_mark_dirty(fi.folio);
|
||||
folio_unlock(fi.folio);
|
||||
}
|
||||
page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
|
||||
do {
|
||||
bio_release_page(bio, page++);
|
||||
done += PAGE_SIZE;
|
||||
} while (done < fi.length);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__bio_release_pages);
|
||||
@ -1439,18 +1451,12 @@ EXPORT_SYMBOL(bio_free_pages);
|
||||
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
|
||||
* for performing direct-IO in BIOs.
|
||||
*
|
||||
* The problem is that we cannot run set_page_dirty() from interrupt context
|
||||
* The problem is that we cannot run folio_mark_dirty() from interrupt context
|
||||
* because the required locks are not interrupt-safe. So what we can do is to
|
||||
* mark the pages dirty _before_ performing IO. And in interrupt context,
|
||||
* check that the pages are still dirty. If so, fine. If not, redirty them
|
||||
* in process context.
|
||||
*
|
||||
* We special-case compound pages here: normally this means reads into hugetlb
|
||||
* pages. The logic in here doesn't really work right for compound pages
|
||||
* because the VM does not uniformly chase down the head page in all cases.
|
||||
* But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
|
||||
* handle them at all. So we skip compound pages here at an early stage.
|
||||
*
|
||||
* Note that this code is very hard to test under normal circumstances because
|
||||
* direct-io pins the pages with get_user_pages(). This makes
|
||||
* is_page_cache_freeable return false, and the VM will not clean the pages.
|
||||
@ -1466,12 +1472,12 @@ EXPORT_SYMBOL(bio_free_pages);
|
||||
*/
|
||||
void bio_set_pages_dirty(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
struct bvec_iter_all iter_all;
|
||||
struct folio_iter fi;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
if (!PageCompound(bvec->bv_page))
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
bio_for_each_folio_all(fi, bio) {
|
||||
folio_lock(fi.folio);
|
||||
folio_mark_dirty(fi.folio);
|
||||
folio_unlock(fi.folio);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
|
||||
@ -1515,12 +1521,11 @@ static void bio_dirty_fn(struct work_struct *work)
|
||||
|
||||
void bio_check_pages_dirty(struct bio *bio)
|
||||
{
|
||||
struct bio_vec *bvec;
|
||||
struct folio_iter fi;
|
||||
unsigned long flags;
|
||||
struct bvec_iter_all iter_all;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
|
||||
bio_for_each_folio_all(fi, bio) {
|
||||
if (!folio_test_dirty(fi.folio))
|
||||
goto defer;
|
||||
}
|
||||
|
||||
|
@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg)
|
||||
static void blkg_destroy_all(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
struct blkcg_gq *blkg, *n;
|
||||
struct blkcg_gq *blkg;
|
||||
int count = BLKG_DESTROY_BATCH_SIZE;
|
||||
int i;
|
||||
|
||||
restart:
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
|
||||
list_for_each_entry(blkg, &q->blkg_list, q_node) {
|
||||
struct blkcg *blkcg = blkg->blkcg;
|
||||
|
||||
if (hlist_unhashed(&blkg->blkcg_node))
|
||||
@ -2064,6 +2064,9 @@ void bio_associate_blkg(struct bio *bio)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
if (blk_op_is_passthrough(bio->bi_opf))
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
if (bio->bi_blkg)
|
||||
|
@ -252,7 +252,8 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
|
||||
if (blkcg == &blkcg_root)
|
||||
return q->root_blkg;
|
||||
|
||||
blkg = rcu_dereference(blkcg->blkg_hint);
|
||||
blkg = rcu_dereference_check(blkcg->blkg_hint,
|
||||
lockdep_is_held(&q->queue_lock));
|
||||
if (blkg && blkg->q == q)
|
||||
return blkg;
|
||||
|
||||
|
@ -772,6 +772,15 @@ void submit_bio_noacct(struct bio *bio)
|
||||
bio_clear_polled(bio);
|
||||
|
||||
switch (bio_op(bio)) {
|
||||
case REQ_OP_READ:
|
||||
case REQ_OP_WRITE:
|
||||
break;
|
||||
case REQ_OP_FLUSH:
|
||||
/*
|
||||
* REQ_OP_FLUSH can't be submitted through bios, it is only
|
||||
* synthetized in struct request by the flush state machine.
|
||||
*/
|
||||
goto not_supported;
|
||||
case REQ_OP_DISCARD:
|
||||
if (!bdev_max_discard_sectors(bdev))
|
||||
goto not_supported;
|
||||
@ -785,6 +794,10 @@ void submit_bio_noacct(struct bio *bio)
|
||||
if (status != BLK_STS_OK)
|
||||
goto end_io;
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
if (!q->limits.max_write_zeroes_sectors)
|
||||
goto not_supported;
|
||||
break;
|
||||
case REQ_OP_ZONE_RESET:
|
||||
case REQ_OP_ZONE_OPEN:
|
||||
case REQ_OP_ZONE_CLOSE:
|
||||
@ -796,12 +809,15 @@ void submit_bio_noacct(struct bio *bio)
|
||||
if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
|
||||
goto not_supported;
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
if (!q->limits.max_write_zeroes_sectors)
|
||||
goto not_supported;
|
||||
break;
|
||||
case REQ_OP_DRV_IN:
|
||||
case REQ_OP_DRV_OUT:
|
||||
/*
|
||||
* Driver private operations are only used with passthrough
|
||||
* requests.
|
||||
*/
|
||||
fallthrough;
|
||||
default:
|
||||
break;
|
||||
goto not_supported;
|
||||
}
|
||||
|
||||
if (blk_throtl_bio(bio))
|
||||
|
@ -115,17 +115,13 @@ static struct bio *bio_split_discard(struct bio *bio,
|
||||
|
||||
*nsegs = 1;
|
||||
|
||||
/* Zero-sector (unknown) and one-sector granularities are the same. */
|
||||
granularity = max(lim->discard_granularity >> 9, 1U);
|
||||
|
||||
max_discard_sectors =
|
||||
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
|
||||
max_discard_sectors -= max_discard_sectors % granularity;
|
||||
|
||||
if (unlikely(!max_discard_sectors)) {
|
||||
/* XXX: warn */
|
||||
if (unlikely(!max_discard_sectors))
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (bio_sectors(bio) <= max_discard_sectors)
|
||||
return NULL;
|
||||
|
@ -1248,7 +1248,8 @@ void blk_mq_start_request(struct request *rq)
|
||||
|
||||
trace_block_rq_issue(rq);
|
||||
|
||||
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
|
||||
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
|
||||
!blk_rq_is_passthrough(rq)) {
|
||||
rq->io_start_time_ns = ktime_get_ns();
|
||||
rq->stats_sectors = blk_rq_sectors(rq);
|
||||
rq->rq_flags |= RQF_STATS;
|
||||
|
@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
|
||||
|
||||
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
|
||||
{
|
||||
if (q->rq_qos)
|
||||
if (q->rq_qos && !blk_rq_is_passthrough(rq))
|
||||
__rq_qos_done(q->rq_qos, rq);
|
||||
}
|
||||
|
||||
|
@ -48,7 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
|
||||
lim->max_discard_sectors = 0;
|
||||
lim->max_hw_discard_sectors = 0;
|
||||
lim->max_secure_erase_sectors = 0;
|
||||
lim->discard_granularity = 0;
|
||||
lim->discard_granularity = 512;
|
||||
lim->discard_alignment = 0;
|
||||
lim->discard_misaligned = 0;
|
||||
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
|
||||
@ -56,7 +56,7 @@ void blk_set_default_limits(struct queue_limits *lim)
|
||||
lim->alignment_offset = 0;
|
||||
lim->io_opt = 0;
|
||||
lim->misaligned = 0;
|
||||
lim->zoned = BLK_ZONED_NONE;
|
||||
lim->zoned = false;
|
||||
lim->zone_write_granularity = 0;
|
||||
lim->dma_alignment = 511;
|
||||
}
|
||||
@ -127,8 +127,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
|
||||
|
||||
if ((max_hw_sectors << 9) < PAGE_SIZE) {
|
||||
max_hw_sectors = 1 << (PAGE_SHIFT - 9);
|
||||
printk(KERN_INFO "%s: set to minimum %d\n",
|
||||
__func__, max_hw_sectors);
|
||||
pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
|
||||
}
|
||||
|
||||
max_hw_sectors = round_down(max_hw_sectors,
|
||||
@ -140,7 +139,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
|
||||
if (limits->max_user_sectors)
|
||||
max_sectors = min(max_sectors, limits->max_user_sectors);
|
||||
else
|
||||
max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS);
|
||||
max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
|
||||
|
||||
max_sectors = round_down(max_sectors,
|
||||
limits->logical_block_size >> SECTOR_SHIFT);
|
||||
@ -248,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments
|
||||
{
|
||||
if (!max_segments) {
|
||||
max_segments = 1;
|
||||
printk(KERN_INFO "%s: set to minimum %d\n",
|
||||
__func__, max_segments);
|
||||
pr_info("%s: set to minimum %u\n", __func__, max_segments);
|
||||
}
|
||||
|
||||
q->limits.max_segments = max_segments;
|
||||
@ -285,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
|
||||
{
|
||||
if (max_size < PAGE_SIZE) {
|
||||
max_size = PAGE_SIZE;
|
||||
printk(KERN_INFO "%s: set to minimum %d\n",
|
||||
__func__, max_size);
|
||||
pr_info("%s: set to minimum %u\n", __func__, max_size);
|
||||
}
|
||||
|
||||
/* see blk_queue_virt_boundary() for the explanation */
|
||||
@ -312,6 +309,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
|
||||
|
||||
limits->logical_block_size = size;
|
||||
|
||||
if (limits->discard_granularity < limits->logical_block_size)
|
||||
limits->discard_granularity = limits->logical_block_size;
|
||||
|
||||
if (limits->physical_block_size < size)
|
||||
limits->physical_block_size = size;
|
||||
|
||||
@ -342,6 +342,9 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
|
||||
if (q->limits.physical_block_size < q->limits.logical_block_size)
|
||||
q->limits.physical_block_size = q->limits.logical_block_size;
|
||||
|
||||
if (q->limits.discard_granularity < q->limits.physical_block_size)
|
||||
q->limits.discard_granularity = q->limits.physical_block_size;
|
||||
|
||||
if (q->limits.io_min < q->limits.physical_block_size)
|
||||
q->limits.io_min = q->limits.physical_block_size;
|
||||
}
|
||||
@ -740,8 +743,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
|
||||
{
|
||||
if (mask < PAGE_SIZE - 1) {
|
||||
mask = PAGE_SIZE - 1;
|
||||
printk(KERN_INFO "%s: set to minimum %lx\n",
|
||||
__func__, mask);
|
||||
pr_info("%s: set to minimum %lx\n", __func__, mask);
|
||||
}
|
||||
|
||||
q->limits.seg_boundary_mask = mask;
|
||||
@ -841,8 +843,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
|
||||
blk_queue_flag_set(QUEUE_FLAG_FUA, q);
|
||||
else
|
||||
blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
|
||||
|
||||
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
|
||||
|
||||
@ -884,81 +884,22 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
|
||||
|
||||
static bool disk_has_partitions(struct gendisk *disk)
|
||||
{
|
||||
unsigned long idx;
|
||||
struct block_device *part;
|
||||
bool ret = false;
|
||||
|
||||
rcu_read_lock();
|
||||
xa_for_each(&disk->part_tbl, idx, part) {
|
||||
if (bdev_is_partition(part)) {
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* disk_set_zoned - configure the zoned model for a disk
|
||||
* @disk: the gendisk of the queue to configure
|
||||
* @model: the zoned model to set
|
||||
*
|
||||
* Set the zoned model of @disk to @model.
|
||||
*
|
||||
* When @model is BLK_ZONED_HM (host managed), this should be called only
|
||||
* if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
|
||||
* If @model specifies BLK_ZONED_HA (host aware), the effective model used
|
||||
* depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
|
||||
* on the disk.
|
||||
* disk_set_zoned - indicate a zoned device
|
||||
* @disk: gendisk to configure
|
||||
*/
|
||||
void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
|
||||
void disk_set_zoned(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
unsigned int old_model = q->limits.zoned;
|
||||
|
||||
switch (model) {
|
||||
case BLK_ZONED_HM:
|
||||
/*
|
||||
* Host managed devices are supported only if
|
||||
* CONFIG_BLK_DEV_ZONED is enabled.
|
||||
*/
|
||||
WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
|
||||
break;
|
||||
case BLK_ZONED_HA:
|
||||
/*
|
||||
* Host aware devices can be treated either as regular block
|
||||
* devices (similar to drive managed devices) or as zoned block
|
||||
* devices to take advantage of the zone command set, similarly
|
||||
* to host managed devices. We try the latter if there are no
|
||||
* partitions and zoned block device support is enabled, else
|
||||
* we do nothing special as far as the block layer is concerned.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
|
||||
disk_has_partitions(disk))
|
||||
model = BLK_ZONED_NONE;
|
||||
break;
|
||||
case BLK_ZONED_NONE:
|
||||
default:
|
||||
if (WARN_ON_ONCE(model != BLK_ZONED_NONE))
|
||||
model = BLK_ZONED_NONE;
|
||||
break;
|
||||
}
|
||||
WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
|
||||
|
||||
q->limits.zoned = model;
|
||||
if (model != BLK_ZONED_NONE) {
|
||||
/*
|
||||
* Set the zone write granularity to the device logical block
|
||||
* size by default. The driver can change this value if needed.
|
||||
*/
|
||||
blk_queue_zone_write_granularity(q,
|
||||
queue_logical_block_size(q));
|
||||
} else if (old_model != BLK_ZONED_NONE) {
|
||||
disk_clear_zone_settings(disk);
|
||||
}
|
||||
/*
|
||||
* Set the zone write granularity to the device logical block
|
||||
* size by default. The driver can change this value if needed.
|
||||
*/
|
||||
q->limits.zoned = true;
|
||||
blk_queue_zone_write_granularity(q, queue_logical_block_size(q));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(disk_set_zoned);
|
||||
|
||||
|
@ -241,7 +241,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
|
||||
if (max_sectors_kb == 0) {
|
||||
q->limits.max_user_sectors = 0;
|
||||
max_sectors_kb = min(max_hw_sectors_kb,
|
||||
BLK_DEF_MAX_SECTORS >> 1);
|
||||
BLK_DEF_MAX_SECTORS_CAP >> 1);
|
||||
} else {
|
||||
if (max_sectors_kb > max_hw_sectors_kb ||
|
||||
max_sectors_kb < page_kb)
|
||||
@ -309,14 +309,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
|
||||
|
||||
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
|
||||
{
|
||||
switch (blk_queue_zoned_model(q)) {
|
||||
case BLK_ZONED_HA:
|
||||
return sprintf(page, "host-aware\n");
|
||||
case BLK_ZONED_HM:
|
||||
if (blk_queue_is_zoned(q))
|
||||
return sprintf(page, "host-managed\n");
|
||||
default:
|
||||
return sprintf(page, "none\n");
|
||||
}
|
||||
return sprintf(page, "none\n");
|
||||
}
|
||||
|
||||
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
|
||||
|
@ -84,8 +84,6 @@ struct rq_wb {
|
||||
u64 sync_issue;
|
||||
void *sync_cookie;
|
||||
|
||||
unsigned int wc;
|
||||
|
||||
unsigned long last_issue; /* last non-throttled issue */
|
||||
unsigned long last_comp; /* last non-throttled comp */
|
||||
unsigned long min_lat_nsec;
|
||||
@ -207,7 +205,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
|
||||
*/
|
||||
if (wb_acct & WBT_DISCARD)
|
||||
limit = rwb->wb_background;
|
||||
else if (rwb->wc && !wb_recent_wait(rwb))
|
||||
else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
|
||||
!wb_recent_wait(rwb))
|
||||
limit = 0;
|
||||
else
|
||||
limit = rwb->wb_normal;
|
||||
@ -699,13 +698,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Record the write cache state in the queue's wbt instance. Does nothing
 * when wbt_rq_qos() finds no wbt rq_qos on @q.
 */
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	if (!rqos)
		return;

	RQWB(rqos)->wc = write_cache_on;
}
|
||||
|
||||
/*
|
||||
* Enable wbt if defaults are configured that way
|
||||
*/
|
||||
@ -918,7 +910,6 @@ int wbt_init(struct gendisk *disk)
|
||||
rwb->last_comp = rwb->last_issue = jiffies;
|
||||
rwb->win_nsec = RWB_WINDOW_NSEC;
|
||||
rwb->enable_state = WBT_STATE_ON_DEFAULT;
|
||||
rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
|
||||
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
|
||||
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
|
||||
rwb->rq_depth.queue_depth = blk_queue_depth(q);
|
||||
|
@ -12,8 +12,6 @@ u64 wbt_get_min_lat(struct request_queue *q);
|
||||
void wbt_set_min_lat(struct request_queue *q, u64 val);
|
||||
bool wbt_disabled(struct request_queue *);
|
||||
|
||||
void wbt_set_write_cache(struct request_queue *, bool);
|
||||
|
||||
u64 wbt_default_latency_nsec(struct request_queue *);
|
||||
|
||||
#else
|
||||
@ -24,9 +22,6 @@ static inline void wbt_disable_default(struct gendisk *disk)
|
||||
static inline void wbt_enable_default(struct gendisk *disk)
|
||||
{
|
||||
}
|
||||
static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_BLK_WBT */
|
||||
|
||||
|
@ -498,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
|
||||
set_bit(idx, args->conv_zones_bitmap);
|
||||
break;
|
||||
case BLK_ZONE_TYPE_SEQWRITE_REQ:
|
||||
case BLK_ZONE_TYPE_SEQWRITE_PREF:
|
||||
if (!args->seq_zones_wlock) {
|
||||
args->seq_zones_wlock =
|
||||
blk_alloc_zone_bitmap(q->node, args->nr_zones);
|
||||
@ -506,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
|
||||
return -ENOMEM;
|
||||
}
|
||||
break;
|
||||
case BLK_ZONE_TYPE_SEQWRITE_PREF:
|
||||
default:
|
||||
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
|
||||
disk->disk_name, (int)zone->type, zone->start);
|
||||
@ -615,22 +615,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
|
||||
|
||||
void disk_clear_zone_settings(struct gendisk *disk)
|
||||
{
|
||||
struct request_queue *q = disk->queue;
|
||||
|
||||
blk_mq_freeze_queue(q);
|
||||
|
||||
disk_free_zone_bitmaps(disk);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
|
||||
q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
|
||||
disk->nr_zones = 0;
|
||||
disk->max_open_zones = 0;
|
||||
disk->max_active_zones = 0;
|
||||
q->limits.chunk_sectors = 0;
|
||||
q->limits.zone_write_granularity = 0;
|
||||
q->limits.max_zone_append_sectors = 0;
|
||||
|
||||
blk_mq_unfreeze_queue(q);
|
||||
}
|
||||
|
@ -395,14 +395,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
void disk_free_zone_bitmaps(struct gendisk *disk);
|
||||
void disk_clear_zone_settings(struct gendisk *disk);
|
||||
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
|
||||
unsigned long arg);
|
||||
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
|
||||
unsigned int cmd, unsigned long arg);
|
||||
#else /* CONFIG_BLK_DEV_ZONED */
|
||||
static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
|
||||
static inline void disk_clear_zone_settings(struct gendisk *disk) {}
|
||||
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
|
||||
unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
|
@ -432,7 +432,9 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
DISK_MAX_PARTS);
|
||||
disk->minors = DISK_MAX_PARTS;
|
||||
}
|
||||
if (disk->first_minor + disk->minors > MINORMASK + 1)
|
||||
if (disk->first_minor > MINORMASK ||
|
||||
disk->minors > MINORMASK + 1 ||
|
||||
disk->first_minor + disk->minors > MINORMASK + 1)
|
||||
goto out_exit_elevator;
|
||||
} else {
|
||||
if (WARN_ON(disk->minors))
|
||||
@ -542,6 +544,7 @@ out_put_holder_dir:
|
||||
kobject_put(disk->part0->bd_holder_dir);
|
||||
out_del_block_link:
|
||||
sysfs_remove_link(block_depr, dev_name(ddev));
|
||||
pm_runtime_set_memalloc_noio(ddev, false);
|
||||
out_device_del:
|
||||
device_del(ddev);
|
||||
out_free_ext_minor:
|
||||
|
@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
|
||||
{
|
||||
struct gendisk *disk = bdev->bd_disk;
|
||||
struct blkpg_partition p;
|
||||
long long start, length;
|
||||
sector_t start, length;
|
||||
|
||||
if (disk->flags & GENHD_FL_NO_PART)
|
||||
return -EINVAL;
|
||||
@ -35,14 +35,17 @@ static int blkpg_do_ioctl(struct block_device *bdev,
|
||||
if (op == BLKPG_DEL_PARTITION)
|
||||
return bdev_del_partition(disk, p.pno);
|
||||
|
||||
if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
|
||||
return -EINVAL;
|
||||
/* Check that the partition is aligned to the block size */
|
||||
if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
|
||||
return -EINVAL;
|
||||
|
||||
start = p.start >> SECTOR_SHIFT;
|
||||
length = p.length >> SECTOR_SHIFT;
|
||||
|
||||
switch (op) {
|
||||
case BLKPG_ADD_PARTITION:
|
||||
/* check if partition is aligned to blocksize */
|
||||
if (p.start & (bdev_logical_block_size(bdev) - 1))
|
||||
return -EINVAL;
|
||||
return bdev_add_partition(disk, p.pno, start, length);
|
||||
case BLKPG_RESIZE_PARTITION:
|
||||
return bdev_resize_partition(disk, p.pno, start, length);
|
||||
|
@ -305,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
|
||||
* Partitions are not supported on zoned block devices that are used as
|
||||
* such.
|
||||
*/
|
||||
switch (disk->queue->limits.zoned) {
|
||||
case BLK_ZONED_HM:
|
||||
if (bdev_is_zoned(disk->part0)) {
|
||||
pr_warn("%s: partitions not supported on host managed zoned block device\n",
|
||||
disk->disk_name);
|
||||
return ERR_PTR(-ENXIO);
|
||||
case BLK_ZONED_HA:
|
||||
pr_info("%s: disabling host aware zoned block device support due to partitions\n",
|
||||
disk->disk_name);
|
||||
disk_set_zoned(disk, BLK_ZONED_NONE);
|
||||
break;
|
||||
case BLK_ZONED_NONE:
|
||||
break;
|
||||
}
|
||||
|
||||
if (xa_load(&disk->part_tbl, partno))
|
||||
@ -613,7 +605,7 @@ static int blk_add_partitions(struct gendisk *disk)
|
||||
/*
|
||||
* Partitions are not supported on host managed zoned block devices.
|
||||
*/
|
||||
if (disk->queue->limits.zoned == BLK_ZONED_HM) {
|
||||
if (bdev_is_zoned(disk->part0)) {
|
||||
pr_warn("%s: ignoring partition table on host managed zoned block device\n",
|
||||
disk->disk_name);
|
||||
ret = 0;
|
||||
|
@ -383,7 +383,8 @@ aoeblk_gdalloc(void *vp)
|
||||
WARN_ON(d->flags & DEVFL_TKILL);
|
||||
WARN_ON(d->gd);
|
||||
WARN_ON(d->flags & DEVFL_UP);
|
||||
blk_queue_max_hw_sectors(gd->queue, BLK_DEF_MAX_SECTORS);
|
||||
/* arbitrary value taken from the historical block layer max_sectors cap */
|
||||
blk_queue_max_hw_sectors(gd->queue, 2560u);
|
||||
blk_queue_io_opt(gd->queue, SZ_2M);
|
||||
d->bufpool = mp;
|
||||
d->blkq = gd->queue;
|
||||
|
@ -838,8 +838,8 @@ static bool plausible_request_size(int size)
|
||||
}
|
||||
|
||||
/* clear the bit corresponding to the piece of storage in question:
|
||||
* size byte of data starting from sector. Only clear a bits of the affected
|
||||
* one ore more _aligned_ BM_BLOCK_SIZE blocks.
|
||||
* size byte of data starting from sector. Only clear bits of the affected
|
||||
* one or more _aligned_ BM_BLOCK_SIZE blocks.
|
||||
*
|
||||
* called by worker on C_SYNC_TARGET and receiver on SyncSource.
|
||||
*
|
||||
@ -957,7 +957,9 @@ static int _is_in_al(struct drbd_device *device, unsigned int enr)
|
||||
* @device: DRBD device.
|
||||
* @sector: The sector number.
|
||||
*
|
||||
* This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
|
||||
* This function sleeps on al_wait.
|
||||
*
|
||||
* Returns: %0 on success, -EINTR if interrupted.
|
||||
*/
|
||||
int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
|
||||
{
|
||||
@ -1004,11 +1006,13 @@ retry:
|
||||
|
||||
/**
|
||||
* drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
|
||||
* @device: DRBD device.
|
||||
* @peer_device: DRBD device.
|
||||
* @sector: The sector number.
|
||||
*
|
||||
* Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
|
||||
* tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
|
||||
* tries to set it to BME_LOCKED.
|
||||
*
|
||||
* Returns: %0 upon success, and -EAGAIN
|
||||
* if there is still application IO going on in this area.
|
||||
*/
|
||||
int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector)
|
||||
@ -1190,7 +1194,7 @@ void drbd_rs_cancel_all(struct drbd_device *device)
|
||||
* drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
|
||||
* @device: DRBD device.
|
||||
*
|
||||
* Returns 0 upon success, -EAGAIN if at least one reference count was
|
||||
* Returns: %0 upon success, -EAGAIN if at least one reference count was
|
||||
* not zero.
|
||||
*/
|
||||
int drbd_rs_del_all(struct drbd_device *device)
|
||||
|
@ -1301,8 +1301,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
|
||||
loop_set_size(lo, new_size);
|
||||
}
|
||||
|
||||
loop_config_discard(lo);
|
||||
|
||||
/* update dio if lo_offset or transfer is changed */
|
||||
__loop_update_dio(lo, lo->use_dio);
|
||||
|
||||
@ -2036,7 +2034,8 @@ static int loop_add(int i)
|
||||
}
|
||||
lo->lo_queue = lo->lo_disk->queue;
|
||||
|
||||
blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
|
||||
/* arbitrary value taken from the historical block layer max_sectors cap */
|
||||
blk_queue_max_hw_sectors(lo->lo_queue, 2560u);
|
||||
|
||||
/*
|
||||
* By default, we do buffer IO, so it doesn't make sense to enable
|
||||
|
@ -334,10 +334,8 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
|
||||
if (!nbd->pid)
|
||||
return 0;
|
||||
|
||||
if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
|
||||
nbd->disk->queue->limits.discard_granularity = blksize;
|
||||
if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
|
||||
blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
|
||||
}
|
||||
blk_queue_logical_block_size(nbd->disk->queue, blksize);
|
||||
blk_queue_physical_block_size(nbd->disk->queue, blksize);
|
||||
|
||||
@ -1357,7 +1355,6 @@ static void nbd_config_put(struct nbd_device *nbd)
|
||||
nbd->config = NULL;
|
||||
|
||||
nbd->tag_set.timeout = 0;
|
||||
nbd->disk->queue->limits.discard_granularity = 0;
|
||||
blk_queue_max_discard_sectors(nbd->disk->queue, 0);
|
||||
|
||||
mutex_unlock(&nbd->config_lock);
|
||||
@ -1850,7 +1847,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
|
||||
* Tell the block layer that we are not a rotational device
|
||||
*/
|
||||
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
|
||||
disk->queue->limits.discard_granularity = 0;
|
||||
blk_queue_max_discard_sectors(disk->queue, 0);
|
||||
blk_queue_max_segment_size(disk->queue, UINT_MAX);
|
||||
blk_queue_max_segments(disk->queue, USHRT_MAX);
|
||||
|
@ -1880,7 +1880,6 @@ static void null_config_discard(struct nullb *nullb)
|
||||
return;
|
||||
}
|
||||
|
||||
nullb->q->limits.discard_granularity = nullb->dev->blocksize;
|
||||
blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
|
||||
}
|
||||
|
||||
@ -2186,10 +2185,8 @@ static int null_add_dev(struct nullb_device *dev)
|
||||
|
||||
blk_queue_logical_block_size(nullb->q, dev->blocksize);
|
||||
blk_queue_physical_block_size(nullb->q, dev->blocksize);
|
||||
if (!dev->max_sectors)
|
||||
dev->max_sectors = queue_max_hw_sectors(nullb->q);
|
||||
dev->max_sectors = min(dev->max_sectors, BLK_DEF_MAX_SECTORS);
|
||||
blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
|
||||
if (dev->max_sectors)
|
||||
blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
|
||||
|
||||
if (dev->virt_boundary)
|
||||
blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);
|
||||
@ -2289,12 +2286,6 @@ static int __init null_init(void)
|
||||
g_bs = PAGE_SIZE;
|
||||
}
|
||||
|
||||
if (g_max_sectors > BLK_DEF_MAX_SECTORS) {
|
||||
pr_warn("invalid max sectors\n");
|
||||
pr_warn("defaults max sectors to %u\n", BLK_DEF_MAX_SECTORS);
|
||||
g_max_sectors = BLK_DEF_MAX_SECTORS;
|
||||
}
|
||||
|
||||
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
|
||||
pr_err("invalid home_node value\n");
|
||||
g_home_node = NUMA_NO_NODE;
|
||||
|
@ -159,7 +159,7 @@ int null_register_zoned_dev(struct nullb *nullb)
|
||||
struct nullb_device *dev = nullb->dev;
|
||||
struct request_queue *q = nullb->q;
|
||||
|
||||
disk_set_zoned(nullb->disk, BLK_ZONED_HM);
|
||||
disk_set_zoned(nullb->disk);
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
|
||||
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
|
||||
blk_queue_chunk_sectors(q, dev->zone_size_sects);
|
||||
|
@ -1006,10 +1006,10 @@ static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
|
||||
msg.prio = cpu_to_le16(req_get_ioprio(rq));
|
||||
|
||||
/*
|
||||
* We only support discards with single segment for now.
|
||||
* We only support discards/WRITE_ZEROES with single segment for now.
|
||||
* See queue limits.
|
||||
*/
|
||||
if (req_op(rq) != REQ_OP_DISCARD)
|
||||
if ((req_op(rq) != REQ_OP_DISCARD) && (req_op(rq) != REQ_OP_WRITE_ZEROES))
|
||||
sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl);
|
||||
|
||||
if (sg_cnt == 0)
|
||||
@ -1362,6 +1362,8 @@ static void setup_request_queue(struct rnbd_clt_dev *dev,
|
||||
blk_queue_write_cache(dev->queue,
|
||||
!!(rsp->cache_policy & RNBD_WRITEBACK),
|
||||
!!(rsp->cache_policy & RNBD_FUA));
|
||||
blk_queue_max_write_zeroes_sectors(dev->queue,
|
||||
le32_to_cpu(rsp->max_write_zeroes_sectors));
|
||||
}
|
||||
|
||||
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
|
||||
@ -1567,8 +1569,8 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
|
||||
|
||||
dev = init_dev(sess, access_mode, pathname, nr_poll_queues);
|
||||
if (IS_ERR(dev)) {
|
||||
pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %ld\n",
|
||||
pathname, sess->sessname, PTR_ERR(dev));
|
||||
pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %pe\n",
|
||||
pathname, sess->sessname, dev);
|
||||
ret = PTR_ERR(dev);
|
||||
goto put_sess;
|
||||
}
|
||||
@ -1626,10 +1628,11 @@ struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
|
||||
}
|
||||
|
||||
rnbd_clt_info(dev,
|
||||
"map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
|
||||
"map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_write_zeroes_sectors: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
|
||||
dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
|
||||
le16_to_cpu(rsp->logical_block_size),
|
||||
le16_to_cpu(rsp->physical_block_size),
|
||||
le32_to_cpu(rsp->max_write_zeroes_sectors),
|
||||
le32_to_cpu(rsp->max_discard_sectors),
|
||||
le32_to_cpu(rsp->discard_granularity),
|
||||
le32_to_cpu(rsp->discard_alignment),
|
||||
|
@ -128,7 +128,7 @@ enum rnbd_cache_policy {
|
||||
* @device_id: device_id on server side to identify the device
|
||||
* @nsectors: number of sectors in the usual 512b unit
|
||||
* @max_hw_sectors: max hardware sectors in the usual 512b unit
|
||||
* @max_write_same_sectors: max sectors for WRITE SAME in the 512b unit
|
||||
* @max_write_zeroes_sectors: max sectors for WRITE ZEROES in the 512b unit
|
||||
* @max_discard_sectors: max. sectors that can be discarded at once in 512b
|
||||
* unit.
|
||||
* @discard_granularity: size of the internal discard allocation unit in bytes
|
||||
@ -145,7 +145,7 @@ struct rnbd_msg_open_rsp {
|
||||
__le32 device_id;
|
||||
__le64 nsectors;
|
||||
__le32 max_hw_sectors;
|
||||
__le32 max_write_same_sectors;
|
||||
__le32 max_write_zeroes_sectors;
|
||||
__le32 max_discard_sectors;
|
||||
__le32 discard_granularity;
|
||||
__le32 discard_alignment;
|
||||
@ -186,7 +186,7 @@ struct rnbd_msg_io {
|
||||
* @RNBD_OP_FLUSH: flush the volatile write cache
|
||||
* @RNBD_OP_DISCARD: discard sectors
|
||||
* @RNBD_OP_SECURE_ERASE: securely erase sectors
|
||||
* @RNBD_OP_WRITE_SAME: write the same sectors many times
|
||||
* @RNBD_OP_WRITE_ZEROES: write zeroes sectors
|
||||
|
||||
* @RNBD_F_SYNC: request is sync (sync write or read)
|
||||
* @RNBD_F_FUA: forced unit access
|
||||
@ -199,7 +199,7 @@ enum rnbd_io_flags {
|
||||
RNBD_OP_FLUSH = 2,
|
||||
RNBD_OP_DISCARD = 3,
|
||||
RNBD_OP_SECURE_ERASE = 4,
|
||||
RNBD_OP_WRITE_SAME = 5,
|
||||
RNBD_OP_WRITE_ZEROES = 5,
|
||||
|
||||
/* Flags */
|
||||
RNBD_F_SYNC = 1<<(RNBD_OP_BITS + 0),
|
||||
@ -236,6 +236,9 @@ static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf)
|
||||
case RNBD_OP_SECURE_ERASE:
|
||||
bio_opf = REQ_OP_SECURE_ERASE;
|
||||
break;
|
||||
case RNBD_OP_WRITE_ZEROES:
|
||||
bio_opf = REQ_OP_WRITE_ZEROES;
|
||||
break;
|
||||
default:
|
||||
WARN(1, "Unknown RNBD type: %d (flags %d)\n",
|
||||
rnbd_op(rnbd_opf), rnbd_opf);
|
||||
@ -268,6 +271,9 @@ static inline u32 rq_to_rnbd_flags(struct request *rq)
|
||||
case REQ_OP_SECURE_ERASE:
|
||||
rnbd_opf = RNBD_OP_SECURE_ERASE;
|
||||
break;
|
||||
case REQ_OP_WRITE_ZEROES:
|
||||
rnbd_opf = RNBD_OP_WRITE_ZEROES;
|
||||
break;
|
||||
case REQ_OP_FLUSH:
|
||||
rnbd_opf = RNBD_OP_FLUSH;
|
||||
break;
|
||||
|
@ -136,8 +136,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
|
||||
|
||||
sess_dev = rnbd_get_sess_dev(dev_id, srv_sess);
|
||||
if (IS_ERR(sess_dev)) {
|
||||
pr_err_ratelimited("Got I/O request on session %s for unknown device id %d\n",
|
||||
srv_sess->sessname, dev_id);
|
||||
pr_err_ratelimited("Got I/O request on session %s for unknown device id %d: %pe\n",
|
||||
srv_sess->sessname, dev_id, sess_dev);
|
||||
err = -ENOTCONN;
|
||||
goto err;
|
||||
}
|
||||
@ -544,7 +544,8 @@ static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
|
||||
rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev));
|
||||
rsp->max_hw_sectors =
|
||||
cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev)));
|
||||
rsp->max_write_same_sectors = 0;
|
||||
rsp->max_write_zeroes_sectors =
|
||||
cpu_to_le32(bdev_write_zeroes_sectors(bdev));
|
||||
rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev));
|
||||
rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev));
|
||||
rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev));
|
||||
@ -585,6 +586,7 @@ static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess,
|
||||
{
|
||||
char *full_path;
|
||||
char *a, *b;
|
||||
int len;
|
||||
|
||||
full_path = kmalloc(PATH_MAX, GFP_KERNEL);
|
||||
if (!full_path)
|
||||
@ -596,19 +598,19 @@ static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess,
|
||||
*/
|
||||
a = strnstr(dev_search_path, "%SESSNAME%", sizeof(dev_search_path));
|
||||
if (a) {
|
||||
int len = a - dev_search_path;
|
||||
len = a - dev_search_path;
|
||||
|
||||
len = snprintf(full_path, PATH_MAX, "%.*s/%s/%s", len,
|
||||
dev_search_path, srv_sess->sessname, dev_name);
|
||||
if (len >= PATH_MAX) {
|
||||
pr_err("Too long path: %s, %s, %s\n",
|
||||
dev_search_path, srv_sess->sessname, dev_name);
|
||||
kfree(full_path);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
} else {
|
||||
snprintf(full_path, PATH_MAX, "%s/%s",
|
||||
dev_search_path, dev_name);
|
||||
len = snprintf(full_path, PATH_MAX, "%s/%s",
|
||||
dev_search_path, dev_name);
|
||||
}
|
||||
if (len >= PATH_MAX) {
|
||||
pr_err("Too long path: %s, %s, %s\n",
|
||||
dev_search_path, srv_sess->sessname, dev_name);
|
||||
kfree(full_path);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
|
||||
/* eliminate duplicated slashes */
|
||||
@ -709,24 +711,24 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
|
||||
full_path = rnbd_srv_get_full_path(srv_sess, open_msg->dev_name);
|
||||
if (IS_ERR(full_path)) {
|
||||
ret = PTR_ERR(full_path);
|
||||
pr_err("Opening device '%s' for client %s failed, failed to get device full path, err: %d\n",
|
||||
open_msg->dev_name, srv_sess->sessname, ret);
|
||||
pr_err("Opening device '%s' for client %s failed, failed to get device full path, err: %pe\n",
|
||||
open_msg->dev_name, srv_sess->sessname, full_path);
|
||||
goto reject;
|
||||
}
|
||||
|
||||
bdev_handle = bdev_open_by_path(full_path, open_flags, NULL, NULL);
|
||||
if (IS_ERR(bdev_handle)) {
|
||||
ret = PTR_ERR(bdev_handle);
|
||||
pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n",
|
||||
full_path, srv_sess->sessname, ret);
|
||||
pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %pe\n",
|
||||
full_path, srv_sess->sessname, bdev_handle);
|
||||
goto free_path;
|
||||
}
|
||||
|
||||
srv_dev = rnbd_srv_get_or_create_srv_dev(bdev_handle->bdev, srv_sess,
|
||||
open_msg->access_mode);
|
||||
if (IS_ERR(srv_dev)) {
|
||||
pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n",
|
||||
full_path, srv_sess->sessname, PTR_ERR(srv_dev));
|
||||
pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %pe\n",
|
||||
full_path, srv_sess->sessname, srv_dev);
|
||||
ret = PTR_ERR(srv_dev);
|
||||
goto blkdev_put;
|
||||
}
|
||||
@ -736,8 +738,8 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
|
||||
open_msg->access_mode == RNBD_ACCESS_RO,
|
||||
srv_dev);
|
||||
if (IS_ERR(srv_sess_dev)) {
|
||||
pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n",
|
||||
full_path, srv_sess->sessname, PTR_ERR(srv_sess_dev));
|
||||
pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %pe\n",
|
||||
full_path, srv_sess->sessname, srv_sess_dev);
|
||||
ret = PTR_ERR(srv_sess_dev);
|
||||
goto srv_dev_put;
|
||||
}
|
||||
@ -818,7 +820,7 @@ static int __init rnbd_srv_init_module(void)
|
||||
};
|
||||
rtrs_ctx = rtrs_srv_open(&rtrs_ops, port_nr);
|
||||
if (IS_ERR(rtrs_ctx)) {
|
||||
pr_err("rtrs_srv_open(), err: %d\n", err);
|
||||
pr_err("rtrs_srv_open(), err: %pe\n", rtrs_ctx);
|
||||
return PTR_ERR(rtrs_ctx);
|
||||
}
|
||||
|
||||
|
@ -250,7 +250,7 @@ static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
|
||||
{
|
||||
const struct ublk_param_zoned *p = &ub->params.zoned;
|
||||
|
||||
disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
|
||||
disk_set_zoned(ub->ub_disk);
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
|
||||
blk_queue_required_elevator_features(ub->ub_disk->queue,
|
||||
ELEVATOR_F_ZBD_SEQ_WRITE);
|
||||
|
@ -722,52 +722,15 @@ fail_report:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void virtblk_revalidate_zones(struct virtio_blk *vblk)
|
||||
{
|
||||
u8 model;
|
||||
|
||||
virtio_cread(vblk->vdev, struct virtio_blk_config,
|
||||
zoned.model, &model);
|
||||
switch (model) {
|
||||
default:
|
||||
dev_err(&vblk->vdev->dev, "unknown zone model %d\n", model);
|
||||
fallthrough;
|
||||
case VIRTIO_BLK_Z_NONE:
|
||||
case VIRTIO_BLK_Z_HA:
|
||||
disk_set_zoned(vblk->disk, BLK_ZONED_NONE);
|
||||
return;
|
||||
case VIRTIO_BLK_Z_HM:
|
||||
WARN_ON_ONCE(!vblk->zone_sectors);
|
||||
if (!blk_revalidate_disk_zones(vblk->disk, NULL))
|
||||
set_capacity_and_notify(vblk->disk, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static int virtblk_probe_zoned_device(struct virtio_device *vdev,
|
||||
struct virtio_blk *vblk,
|
||||
struct request_queue *q)
|
||||
{
|
||||
u32 v, wg;
|
||||
u8 model;
|
||||
|
||||
virtio_cread(vdev, struct virtio_blk_config,
|
||||
zoned.model, &model);
|
||||
|
||||
switch (model) {
|
||||
case VIRTIO_BLK_Z_NONE:
|
||||
case VIRTIO_BLK_Z_HA:
|
||||
/* Present the host-aware device as non-zoned */
|
||||
return 0;
|
||||
case VIRTIO_BLK_Z_HM:
|
||||
break;
|
||||
default:
|
||||
dev_err(&vdev->dev, "unsupported zone model %d\n", model);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
|
||||
|
||||
disk_set_zoned(vblk->disk, BLK_ZONED_HM);
|
||||
disk_set_zoned(vblk->disk);
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
|
||||
|
||||
virtio_cread(vdev, struct virtio_blk_config,
|
||||
@ -839,23 +802,12 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
|
||||
*/
|
||||
#define virtblk_report_zones NULL
|
||||
|
||||
static inline void virtblk_revalidate_zones(struct virtio_blk *vblk)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
|
||||
struct virtio_blk *vblk, struct request_queue *q)
|
||||
{
|
||||
u8 model;
|
||||
|
||||
virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model);
|
||||
if (model == VIRTIO_BLK_Z_HM) {
|
||||
dev_err(&vdev->dev,
|
||||
"virtio_blk: zoned devices are not supported");
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
return 0;
|
||||
dev_err(&vdev->dev,
|
||||
"virtio_blk: zoned devices are not supported");
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
@ -1005,7 +957,6 @@ static void virtblk_config_changed_work(struct work_struct *work)
|
||||
struct virtio_blk *vblk =
|
||||
container_of(work, struct virtio_blk, config_work);
|
||||
|
||||
virtblk_revalidate_zones(vblk);
|
||||
virtblk_update_capacity(vblk, true);
|
||||
}
|
||||
|
||||
@ -1570,9 +1521,26 @@ static int virtblk_probe(struct virtio_device *vdev)
|
||||
* placed after the virtio_device_ready() call above.
|
||||
*/
|
||||
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
|
||||
err = virtblk_probe_zoned_device(vdev, vblk, q);
|
||||
if (err)
|
||||
u8 model;
|
||||
|
||||
virtio_cread(vdev, struct virtio_blk_config, zoned.model,
|
||||
&model);
|
||||
switch (model) {
|
||||
case VIRTIO_BLK_Z_NONE:
|
||||
case VIRTIO_BLK_Z_HA:
|
||||
/* Present the host-aware device as non-zoned */
|
||||
break;
|
||||
case VIRTIO_BLK_Z_HM:
|
||||
err = virtblk_probe_zoned_device(vdev, vblk, q);
|
||||
if (err)
|
||||
goto out_cleanup_disk;
|
||||
break;
|
||||
default:
|
||||
dev_err(&vdev->dev, "unsupported zone model %d\n",
|
||||
model);
|
||||
err = -EINVAL;
|
||||
goto out_cleanup_disk;
|
||||
}
|
||||
}
|
||||
|
||||
err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
|
||||
|
@ -132,7 +132,7 @@ struct blkif_x86_32_request {
|
||||
struct blkif_x86_64_request_rw {
|
||||
uint8_t nr_segments; /* number of segments */
|
||||
blkif_vdev_t handle; /* only for read/write requests */
|
||||
uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */
|
||||
uint32_t _pad1; /* offsetof(blkif_request..,u.rw.id)==8 */
|
||||
uint64_t id;
|
||||
blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
|
||||
struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
|
||||
|
@ -2226,7 +2226,6 @@ static int zram_add(void)
|
||||
ZRAM_LOGICAL_BLOCK_SIZE);
|
||||
blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
|
||||
blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
|
||||
zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
|
||||
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
|
||||
|
||||
/*
|
||||
|
@ -61,19 +61,6 @@ config MD_BITMAP_FILE
|
||||
various kernel APIs and can only work with files on a file system not
|
||||
actually sitting on the MD device.
|
||||
|
||||
config MD_LINEAR
|
||||
tristate "Linear (append) mode (deprecated)"
|
||||
depends on BLK_DEV_MD
|
||||
help
|
||||
If you say Y here, then your multiple devices driver will be able to
|
||||
use the so-called linear mode, i.e. it will combine the hard disk
|
||||
partitions by simply appending one to the other.
|
||||
|
||||
To compile this as a module, choose M here: the module
|
||||
will be called linear.
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_RAID0
|
||||
tristate "RAID-0 (striping) mode"
|
||||
depends on BLK_DEV_MD
|
||||
@ -172,27 +159,6 @@ config MD_RAID456
|
||||
|
||||
If unsure, say Y.
|
||||
|
||||
config MD_MULTIPATH
|
||||
tristate "Multipath I/O support (deprecated)"
|
||||
depends on BLK_DEV_MD
|
||||
help
|
||||
MD_MULTIPATH provides a simple multi-path personality for use
|
||||
the MD framework. It is not under active development. New
|
||||
projects should consider using DM_MULTIPATH which has more
|
||||
features and more testing.
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config MD_FAULTY
|
||||
tristate "Faulty test module for MD (deprecated)"
|
||||
depends on BLK_DEV_MD
|
||||
help
|
||||
The "faulty" module allows for a block device that occasionally returns
|
||||
read or write errors. It is useful for testing.
|
||||
|
||||
In unsure, say N.
|
||||
|
||||
|
||||
config MD_CLUSTER
|
||||
tristate "Cluster Support for MD"
|
||||
depends on BLK_DEV_MD
|
||||
|
@ -29,22 +29,16 @@ dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
|
||||
|
||||
md-mod-y += md.o md-bitmap.o
|
||||
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
|
||||
linear-y += md-linear.o
|
||||
multipath-y += md-multipath.o
|
||||
faulty-y += md-faulty.o
|
||||
|
||||
# Note: link order is important. All raid personalities
|
||||
# and must come before md.o, as they each initialise
|
||||
# themselves, and md.o may use the personalities when it
|
||||
# must come before md.o, as they each initialise
|
||||
# themselves, and md.o may use the personalities when it is
|
||||
# auto-initialised.
|
||||
|
||||
obj-$(CONFIG_MD_LINEAR) += linear.o
|
||||
obj-$(CONFIG_MD_RAID0) += raid0.o
|
||||
obj-$(CONFIG_MD_RAID1) += raid1.o
|
||||
obj-$(CONFIG_MD_RAID10) += raid10.o
|
||||
obj-$(CONFIG_MD_RAID456) += raid456.o
|
||||
obj-$(CONFIG_MD_MULTIPATH) += multipath.o
|
||||
obj-$(CONFIG_MD_FAULTY) += faulty.o
|
||||
obj-$(CONFIG_MD_CLUSTER) += md-cluster.o
|
||||
obj-$(CONFIG_BCACHE) += bcache/
|
||||
obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
|
||||
|
@ -954,7 +954,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
|
||||
q->limits.max_segment_size = UINT_MAX;
|
||||
q->limits.max_segments = BIO_MAX_VECS;
|
||||
blk_queue_max_discard_sectors(q, UINT_MAX);
|
||||
q->limits.discard_granularity = 512;
|
||||
q->limits.io_min = block_size;
|
||||
q->limits.logical_block_size = block_size;
|
||||
q->limits.physical_block_size = block_size;
|
||||
|
@ -807,7 +807,7 @@ void dm_kcopyd_copy(struct dm_kcopyd_client *kc, struct dm_io_region *from,
|
||||
*/
|
||||
if (!(job->flags & BIT(DM_KCOPYD_WRITE_SEQ))) {
|
||||
for (i = 0; i < job->num_dests; i++) {
|
||||
if (bdev_zoned_model(dests[i].bdev) == BLK_ZONED_HM) {
|
||||
if (bdev_is_zoned(dests[i].bdev)) {
|
||||
job->flags |= BIT(DM_KCOPYD_WRITE_SEQ);
|
||||
break;
|
||||
}
|
||||
|
@ -1579,21 +1579,18 @@ bool dm_table_has_no_data_devices(struct dm_table *t)
|
||||
return true;
|
||||
}
|
||||
|
||||
static int device_not_zoned_model(struct dm_target *ti, struct dm_dev *dev,
|
||||
sector_t start, sector_t len, void *data)
|
||||
static int device_not_zoned(struct dm_target *ti, struct dm_dev *dev,
|
||||
sector_t start, sector_t len, void *data)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(dev->bdev);
|
||||
enum blk_zoned_model *zoned_model = data;
|
||||
bool *zoned = data;
|
||||
|
||||
return blk_queue_zoned_model(q) != *zoned_model;
|
||||
return bdev_is_zoned(dev->bdev) != *zoned;
|
||||
}
|
||||
|
||||
static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
|
||||
sector_t start, sector_t len, void *data)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(dev->bdev);
|
||||
|
||||
return blk_queue_zoned_model(q) != BLK_ZONED_NONE;
|
||||
return bdev_is_zoned(dev->bdev);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1603,8 +1600,7 @@ static int device_is_zoned_model(struct dm_target *ti, struct dm_dev *dev,
|
||||
* has the DM_TARGET_MIXED_ZONED_MODEL feature set, the devices can have any
|
||||
* zoned model with all zoned devices having the same zone size.
|
||||
*/
|
||||
static bool dm_table_supports_zoned_model(struct dm_table *t,
|
||||
enum blk_zoned_model zoned_model)
|
||||
static bool dm_table_supports_zoned(struct dm_table *t, bool zoned)
|
||||
{
|
||||
for (unsigned int i = 0; i < t->num_targets; i++) {
|
||||
struct dm_target *ti = dm_table_get_target(t, i);
|
||||
@ -1623,11 +1619,11 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
|
||||
|
||||
if (dm_target_supports_zoned_hm(ti->type)) {
|
||||
if (!ti->type->iterate_devices ||
|
||||
ti->type->iterate_devices(ti, device_not_zoned_model,
|
||||
&zoned_model))
|
||||
ti->type->iterate_devices(ti, device_not_zoned,
|
||||
&zoned))
|
||||
return false;
|
||||
} else if (!dm_target_supports_mixed_zoned_model(ti->type)) {
|
||||
if (zoned_model == BLK_ZONED_HM)
|
||||
if (zoned)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -1650,14 +1646,13 @@ static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *
|
||||
* zone sectors, if the destination device is a zoned block device, it shall
|
||||
* have the specified zone_sectors.
|
||||
*/
|
||||
static int validate_hardware_zoned_model(struct dm_table *t,
|
||||
enum blk_zoned_model zoned_model,
|
||||
unsigned int zone_sectors)
|
||||
static int validate_hardware_zoned(struct dm_table *t, bool zoned,
|
||||
unsigned int zone_sectors)
|
||||
{
|
||||
if (zoned_model == BLK_ZONED_NONE)
|
||||
if (!zoned)
|
||||
return 0;
|
||||
|
||||
if (!dm_table_supports_zoned_model(t, zoned_model)) {
|
||||
if (!dm_table_supports_zoned(t, zoned)) {
|
||||
DMERR("%s: zoned model is not consistent across all devices",
|
||||
dm_device_name(t->md));
|
||||
return -EINVAL;
|
||||
@ -1683,8 +1678,8 @@ int dm_calculate_queue_limits(struct dm_table *t,
|
||||
struct queue_limits *limits)
|
||||
{
|
||||
struct queue_limits ti_limits;
|
||||
enum blk_zoned_model zoned_model = BLK_ZONED_NONE;
|
||||
unsigned int zone_sectors = 0;
|
||||
bool zoned = false;
|
||||
|
||||
blk_set_stacking_limits(limits);
|
||||
|
||||
@ -1706,12 +1701,12 @@ int dm_calculate_queue_limits(struct dm_table *t,
|
||||
ti->type->iterate_devices(ti, dm_set_device_limits,
|
||||
&ti_limits);
|
||||
|
||||
if (zoned_model == BLK_ZONED_NONE && ti_limits.zoned != BLK_ZONED_NONE) {
|
||||
if (!zoned && ti_limits.zoned) {
|
||||
/*
|
||||
* After stacking all limits, validate all devices
|
||||
* in table support this zoned model and zone sectors.
|
||||
*/
|
||||
zoned_model = ti_limits.zoned;
|
||||
zoned = ti_limits.zoned;
|
||||
zone_sectors = ti_limits.chunk_sectors;
|
||||
}
|
||||
|
||||
@ -1744,18 +1739,18 @@ combine_limits:
|
||||
* Verify that the zoned model and zone sectors, as determined before
|
||||
* any .io_hints override, are the same across all devices in the table.
|
||||
* - this is especially relevant if .io_hints is emulating a disk-managed
|
||||
* zoned model (aka BLK_ZONED_NONE) on host-managed zoned block devices.
|
||||
* zoned model on host-managed zoned block devices.
|
||||
* BUT...
|
||||
*/
|
||||
if (limits->zoned != BLK_ZONED_NONE) {
|
||||
if (limits->zoned) {
|
||||
/*
|
||||
* ...IF the above limits stacking determined a zoned model
|
||||
* validate that all of the table's devices conform to it.
|
||||
*/
|
||||
zoned_model = limits->zoned;
|
||||
zoned = limits->zoned;
|
||||
zone_sectors = limits->chunk_sectors;
|
||||
}
|
||||
if (validate_hardware_zoned_model(t, zoned_model, zone_sectors))
|
||||
if (validate_hardware_zoned(t, zoned, zone_sectors))
|
||||
return -EINVAL;
|
||||
|
||||
return validate_hardware_logical_block_alignment(t, limits);
|
||||
|
@ -2836,12 +2836,11 @@ static void dmz_print_dev(struct dmz_metadata *zmd, int num)
|
||||
{
|
||||
struct dmz_dev *dev = &zmd->dev[num];
|
||||
|
||||
if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
|
||||
if (!bdev_is_zoned(dev->bdev))
|
||||
dmz_dev_info(dev, "Regular block device");
|
||||
else
|
||||
dmz_dev_info(dev, "Host-%s zoned block device",
|
||||
bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
|
||||
"aware" : "managed");
|
||||
dmz_dev_info(dev, "Host-managed zoned block device");
|
||||
|
||||
if (zmd->sb_version > 1) {
|
||||
sector_t sector_offset =
|
||||
dev->zone_offset << zmd->zone_nr_sectors_shift;
|
||||
|
@ -702,7 +702,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path,
|
||||
}
|
||||
|
||||
bdev = ddev->bdev;
|
||||
if (bdev_zoned_model(bdev) == BLK_ZONED_NONE) {
|
||||
if (!bdev_is_zoned(bdev)) {
|
||||
if (nr_devs == 1) {
|
||||
ti->error = "Invalid regular device";
|
||||
goto err;
|
||||
@ -1010,7 +1010,7 @@ static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
|
||||
limits->max_sectors = chunk_sectors;
|
||||
|
||||
/* We are exposing a drive-managed zoned block device */
|
||||
limits->zoned = BLK_ZONED_NONE;
|
||||
limits->zoned = false;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -49,7 +49,6 @@ static int md_setup_ents __initdata;
|
||||
* instead of just one. -- KTK
|
||||
* 18May2000: Added support for persistent-superblock arrays:
|
||||
* md=n,0,factor,fault,device-list uses RAID0 for device n
|
||||
* md=n,-1,factor,fault,device-list uses LINEAR for device n
|
||||
* md=n,device-list reads a RAID superblock from the devices
|
||||
* elements in device-list are read by name_to_kdev_t so can be
|
||||
* a hex number or something like /dev/hda1 /dev/sdb
|
||||
@ -88,7 +87,7 @@ static int __init md_setup(char *str)
|
||||
md_setup_ents++;
|
||||
switch (get_option(&str, &level)) { /* RAID level */
|
||||
case 2: /* could be 0 or -1.. */
|
||||
if (level == 0 || level == LEVEL_LINEAR) {
|
||||
if (level == 0) {
|
||||
if (get_option(&str, &factor) != 2 || /* Chunk Size */
|
||||
get_option(&str, &fault) != 2) {
|
||||
printk(KERN_WARNING "md: Too few arguments supplied to md=.\n");
|
||||
@ -96,10 +95,7 @@ static int __init md_setup(char *str)
|
||||
}
|
||||
md_setup_args[ent].level = level;
|
||||
md_setup_args[ent].chunk = 1 << (factor+12);
|
||||
if (level == LEVEL_LINEAR)
|
||||
pername = "linear";
|
||||
else
|
||||
pername = "raid0";
|
||||
pername = "raid0";
|
||||
break;
|
||||
}
|
||||
fallthrough;
|
||||
|
@ -1,365 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* faulty.c : Multiple Devices driver for Linux
|
||||
*
|
||||
* Copyright (C) 2004 Neil Brown
|
||||
*
|
||||
 * faulty-device-simulator personality for md
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* The "faulty" personality causes some requests to fail.
|
||||
*
|
||||
* Possible failure modes are:
|
||||
* reads fail "randomly" but succeed on retry
|
||||
* writes fail "randomly" but succeed on retry
|
||||
* reads for some address fail and then persist until a write
|
||||
* reads for some address fail and then persist irrespective of write
|
||||
* writes for some address fail and persist
|
||||
* all writes fail
|
||||
*
|
||||
* Different modes can be active at a time, but only
|
||||
* one can be set at array creation. Others can be added later.
|
||||
* A mode can be one-shot or recurrent with the recurrence being
|
||||
* once in every N requests.
|
||||
* The bottom 5 bits of the "layout" indicate the mode. The
|
||||
* remainder indicate a period, or 0 for one-shot.
|
||||
*
|
||||
* There is an implementation limit on the number of concurrently
|
||||
* persisting-faulty blocks. When a new fault is requested that would
|
||||
* exceed the limit, it is ignored.
|
||||
 * All current faults can be cleared using a layout of "0".
|
||||
*
|
||||
* Requests are always sent to the device. If they are to fail,
|
||||
* we clone the bio and insert a new b_end_io into the chain.
|
||||
*/
|
||||
|
||||
/*
 * Failure modes.  Values below Modes index the per-mode period[] and
 * counters[] arrays of struct faulty_conf.
 */
#define	WriteTransient	0
#define	ReadTransient	1
#define	WritePersistent	2
#define	ReadPersistent	3
#define	WriteAll	4 /* doesn't go to device */
#define	ReadFixable	5
#define	Modes	6

/*
 * Pseudo-modes accepted in the low bits of the layout:
 * ClearErrors resets every period and counter, ClearFaults forgets the
 * recorded faulty sectors.
 */
#define	ClearErrors	31
#define	ClearFaults	30

/* Per-sector states that are never requested through the layout. */
#define	AllPersist	100 /* internal use only */
#define	NoPersist	101

/* The layout holds the mode in its bottom 5 bits and the period above them. */
#define	ModeMask	0x1f
#define	ModeShift	5

/* Capacity of the table of persistently faulty sectors. */
#define MaxFault	50
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/raid/md_u.h>
|
||||
#include <linux/slab.h>
|
||||
#include "md.h"
|
||||
#include <linux/seq_file.h>
|
||||
|
||||
|
||||
static void faulty_fail(struct bio *bio)
|
||||
{
|
||||
struct bio *b = bio->bi_private;
|
||||
|
||||
b->bi_iter.bi_size = bio->bi_iter.bi_size;
|
||||
b->bi_iter.bi_sector = bio->bi_iter.bi_sector;
|
||||
|
||||
bio_put(bio);
|
||||
|
||||
bio_io_error(b);
|
||||
}
|
||||
|
||||
struct faulty_conf {
|
||||
int period[Modes];
|
||||
atomic_t counters[Modes];
|
||||
sector_t faults[MaxFault];
|
||||
int modes[MaxFault];
|
||||
int nfaults;
|
||||
struct md_rdev *rdev;
|
||||
};
|
||||
|
||||
static int check_mode(struct faulty_conf *conf, int mode)
|
||||
{
|
||||
if (conf->period[mode] == 0 &&
|
||||
atomic_read(&conf->counters[mode]) <= 0)
|
||||
return 0; /* no failure, no decrement */
|
||||
|
||||
|
||||
if (atomic_dec_and_test(&conf->counters[mode])) {
|
||||
if (conf->period[mode])
|
||||
atomic_set(&conf->counters[mode], conf->period[mode]);
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Look for a recorded faulty sector inside [start, end).
 *
 * Only the first recorded sector found in the range is examined.  Its
 * stored mode combined with the direction @dir (READ or WRITE) decides the
 * outcome; a write to a ReadFixable sector repairs it by switching the
 * entry to NoPersist.
 *
 * Returns 1 if the request must fail, 0 otherwise.
 */
static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir)
{
	int idx;

	for (idx = 0; idx < conf->nfaults; idx++) {
		sector_t bad = conf->faults[idx];

		if (bad < start || bad >= end)
			continue;

		/* found it ... */
		switch (conf->modes[idx] * 2 + dir) {
		case WritePersistent * 2 + WRITE:
		case ReadPersistent * 2 + READ:
		case ReadFixable * 2 + READ:
		case AllPersist * 2 + READ:
		case AllPersist * 2 + WRITE:
			return 1;
		case ReadFixable * 2 + WRITE:
			/* If we find a ReadFixable sector, we fix it ... */
			conf->modes[idx] = NoPersist;
			return 0;
		default:
			return 0;
		}
	}
	return 0;
}
|
||||
|
||||
/*
 * Record @start as a faulty sector with persistence @mode.
 *
 * If the sector is already recorded, its mode is merged with the new one
 * (combinations of read and write persistence become AllPersist) and the
 * function returns.  Otherwise the sector goes into the last NoPersist
 * slot seen during the scan, or is appended; when the table already holds
 * MaxFault entries and no slot is reusable the request is silently
 * dropped.
 */
static void add_sector(struct faulty_conf *conf, sector_t start, int mode)
{
	int slot = conf->nfaults;
	int idx;

	for (idx = 0; idx < conf->nfaults; idx++) {
		int *cur = &conf->modes[idx];

		if (conf->faults[idx] != start) {
			/* Remember a free slot in case we need a new entry. */
			if (*cur == NoPersist)
				slot = idx;
			continue;
		}

		switch (mode) {
		case NoPersist:
			*cur = mode;
			return;
		case WritePersistent:
			if (*cur == ReadPersistent || *cur == ReadFixable)
				*cur = AllPersist;
			else
				*cur = WritePersistent;
			return;
		case ReadPersistent:
			if (*cur == WritePersistent)
				*cur = AllPersist;
			else
				*cur = ReadPersistent;
			return;
		case ReadFixable:
			if (*cur == WritePersistent || *cur == ReadPersistent)
				*cur = AllPersist;
			else
				*cur = ReadFixable;
			return;
		default:
			/* Any other mode: keep scanning. */
			break;
		}
	}

	if (slot >= MaxFault)
		return;

	conf->faults[slot] = start;
	conf->modes[slot] = mode;
	if (slot == conf->nfaults)
		conf->nfaults = slot + 1;
}
|
||||
|
||||
static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct faulty_conf *conf = mddev->private;
|
||||
int failit = 0;
|
||||
|
||||
if (bio_data_dir(bio) == WRITE) {
|
||||
/* write request */
|
||||
if (atomic_read(&conf->counters[WriteAll])) {
|
||||
/* special case - don't decrement, don't submit_bio_noacct,
|
||||
* just fail immediately
|
||||
*/
|
||||
bio_io_error(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (check_sector(conf, bio->bi_iter.bi_sector,
|
||||
bio_end_sector(bio), WRITE))
|
||||
failit = 1;
|
||||
if (check_mode(conf, WritePersistent)) {
|
||||
add_sector(conf, bio->bi_iter.bi_sector,
|
||||
WritePersistent);
|
||||
failit = 1;
|
||||
}
|
||||
if (check_mode(conf, WriteTransient))
|
||||
failit = 1;
|
||||
} else {
|
||||
/* read request */
|
||||
if (check_sector(conf, bio->bi_iter.bi_sector,
|
||||
bio_end_sector(bio), READ))
|
||||
failit = 1;
|
||||
if (check_mode(conf, ReadTransient))
|
||||
failit = 1;
|
||||
if (check_mode(conf, ReadPersistent)) {
|
||||
add_sector(conf, bio->bi_iter.bi_sector,
|
||||
ReadPersistent);
|
||||
failit = 1;
|
||||
}
|
||||
if (check_mode(conf, ReadFixable)) {
|
||||
add_sector(conf, bio->bi_iter.bi_sector,
|
||||
ReadFixable);
|
||||
failit = 1;
|
||||
}
|
||||
}
|
||||
|
||||
md_account_bio(mddev, &bio);
|
||||
if (failit) {
|
||||
struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
|
||||
&mddev->bio_set);
|
||||
|
||||
b->bi_private = bio;
|
||||
b->bi_end_io = faulty_fail;
|
||||
bio = b;
|
||||
} else
|
||||
bio_set_dev(bio, conf->rdev->bdev);
|
||||
|
||||
submit_bio_noacct(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void faulty_status(struct seq_file *seq, struct mddev *mddev)
|
||||
{
|
||||
struct faulty_conf *conf = mddev->private;
|
||||
int n;
|
||||
|
||||
if ((n=atomic_read(&conf->counters[WriteTransient])) != 0)
|
||||
seq_printf(seq, " WriteTransient=%d(%d)",
|
||||
n, conf->period[WriteTransient]);
|
||||
|
||||
if ((n=atomic_read(&conf->counters[ReadTransient])) != 0)
|
||||
seq_printf(seq, " ReadTransient=%d(%d)",
|
||||
n, conf->period[ReadTransient]);
|
||||
|
||||
if ((n=atomic_read(&conf->counters[WritePersistent])) != 0)
|
||||
seq_printf(seq, " WritePersistent=%d(%d)",
|
||||
n, conf->period[WritePersistent]);
|
||||
|
||||
if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0)
|
||||
seq_printf(seq, " ReadPersistent=%d(%d)",
|
||||
n, conf->period[ReadPersistent]);
|
||||
|
||||
|
||||
if ((n=atomic_read(&conf->counters[ReadFixable])) != 0)
|
||||
seq_printf(seq, " ReadFixable=%d(%d)",
|
||||
n, conf->period[ReadFixable]);
|
||||
|
||||
if ((n=atomic_read(&conf->counters[WriteAll])) != 0)
|
||||
seq_printf(seq, " WriteAll");
|
||||
|
||||
seq_printf(seq, " nfaults=%d", conf->nfaults);
|
||||
}
|
||||
|
||||
|
||||
static int faulty_reshape(struct mddev *mddev)
|
||||
{
|
||||
int mode = mddev->new_layout & ModeMask;
|
||||
int count = mddev->new_layout >> ModeShift;
|
||||
struct faulty_conf *conf = mddev->private;
|
||||
|
||||
if (mddev->new_layout < 0)
|
||||
return 0;
|
||||
|
||||
/* new layout */
|
||||
if (mode == ClearFaults)
|
||||
conf->nfaults = 0;
|
||||
else if (mode == ClearErrors) {
|
||||
int i;
|
||||
for (i=0 ; i < Modes ; i++) {
|
||||
conf->period[i] = 0;
|
||||
atomic_set(&conf->counters[i], 0);
|
||||
}
|
||||
} else if (mode < Modes) {
|
||||
conf->period[mode] = count;
|
||||
if (!count) count++;
|
||||
atomic_set(&conf->counters[mode], count);
|
||||
} else
|
||||
return -EINVAL;
|
||||
mddev->new_layout = -1;
|
||||
mddev->layout = -1; /* makes sure further changes come through */
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Report the array size: @sectors when it is non-zero, otherwise the
 * device size stored in the mddev.  A non-zero @raid_disks triggers a
 * one-time warning because generic reshape is not supported.
 */
static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	WARN_ONCE(raid_disks,
		  "%s does not support generic reshape\n", __func__);

	return sectors ? sectors : mddev->dev_sectors;
}
|
||||
|
||||
static int faulty_run(struct mddev *mddev)
|
||||
{
|
||||
struct md_rdev *rdev;
|
||||
int i;
|
||||
struct faulty_conf *conf;
|
||||
|
||||
if (md_check_no_bitmap(mddev))
|
||||
return -EINVAL;
|
||||
|
||||
conf = kmalloc(sizeof(*conf), GFP_KERNEL);
|
||||
if (!conf)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i=0; i<Modes; i++) {
|
||||
atomic_set(&conf->counters[i], 0);
|
||||
conf->period[i] = 0;
|
||||
}
|
||||
conf->nfaults = 0;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
conf->rdev = rdev;
|
||||
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
||||
rdev->data_offset << 9);
|
||||
}
|
||||
|
||||
md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
|
||||
mddev->private = conf;
|
||||
|
||||
faulty_reshape(mddev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Release the private configuration handed over in @priv. */
static void faulty_free(struct mddev *mddev, void *priv)
{
	kfree(priv);
}
|
||||
|
||||
static struct md_personality faulty_personality =
|
||||
{
|
||||
.name = "faulty",
|
||||
.level = LEVEL_FAULTY,
|
||||
.owner = THIS_MODULE,
|
||||
.make_request = faulty_make_request,
|
||||
.run = faulty_run,
|
||||
.free = faulty_free,
|
||||
.status = faulty_status,
|
||||
.check_reshape = faulty_reshape,
|
||||
.size = faulty_size,
|
||||
};
|
||||
|
||||
static int __init raid_init(void)
|
||||
{
|
||||
return register_md_personality(&faulty_personality);
|
||||
}
|
||||
|
||||
static void raid_exit(void)
|
||||
{
|
||||
unregister_md_personality(&faulty_personality);
|
||||
}
|
||||
|
||||
module_init(raid_init);
|
||||
module_exit(raid_exit);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)");
|
||||
MODULE_ALIAS("md-personality-10"); /* faulty */
|
||||
MODULE_ALIAS("md-faulty");
|
||||
MODULE_ALIAS("md-level--5");
|
@ -1,318 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
linear.c : Multiple Devices driver for Linux
|
||||
Copyright (C) 1994-96 Marc ZYNGIER
|
||||
<zyngier@ufr-info-p7.ibp.fr> or
|
||||
<maz@gloups.fdn.fr>
|
||||
|
||||
Linear mode management functions.
|
||||
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/raid/md_u.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <trace/events/block.h>
|
||||
#include "md.h"
|
||||
#include "md-linear.h"
|
||||
|
||||
/*
|
||||
* find which device holds a particular offset
|
||||
*/
|
||||
static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector)
|
||||
{
|
||||
int lo, mid, hi;
|
||||
struct linear_conf *conf;
|
||||
|
||||
lo = 0;
|
||||
hi = mddev->raid_disks - 1;
|
||||
conf = mddev->private;
|
||||
|
||||
/*
|
||||
* Binary Search
|
||||
*/
|
||||
|
||||
while (hi > lo) {
|
||||
|
||||
mid = (hi + lo) / 2;
|
||||
if (sector < conf->disks[mid].end_sector)
|
||||
hi = mid;
|
||||
else
|
||||
lo = mid + 1;
|
||||
}
|
||||
|
||||
return conf->disks + lo;
|
||||
}
|
||||
|
||||
/*
 * Report the array size, which is the sum computed when the configuration
 * was built.  Non-zero @sectors or @raid_disks trigger a one-time warning
 * because generic reshape is not supported.
 */
static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	struct linear_conf *conf = mddev->private;

	WARN_ONCE(sectors || raid_disks,
		  "%s does not support generic reshape\n", __func__);

	return conf->array_sectors;
}
|
||||
|
||||
static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
|
||||
{
|
||||
struct linear_conf *conf;
|
||||
struct md_rdev *rdev;
|
||||
int i, cnt;
|
||||
|
||||
conf = kzalloc(struct_size(conf, disks, raid_disks), GFP_KERNEL);
|
||||
if (!conf)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* conf->raid_disks is copy of mddev->raid_disks. The reason to
|
||||
* keep a copy of mddev->raid_disks in struct linear_conf is,
|
||||
* mddev->raid_disks may not be consistent with pointers number of
|
||||
* conf->disks[] when it is updated in linear_add() and used to
|
||||
* iterate old conf->disks[] earray in linear_congested().
|
||||
* Here conf->raid_disks is always consitent with number of
|
||||
* pointers in conf->disks[] array, and mddev->private is updated
|
||||
* with rcu_assign_pointer() in linear_addr(), such race can be
|
||||
* avoided.
|
||||
*/
|
||||
conf->raid_disks = raid_disks;
|
||||
|
||||
cnt = 0;
|
||||
conf->array_sectors = 0;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
int j = rdev->raid_disk;
|
||||
struct dev_info *disk = conf->disks + j;
|
||||
sector_t sectors;
|
||||
|
||||
if (j < 0 || j >= raid_disks || disk->rdev) {
|
||||
pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
|
||||
mdname(mddev));
|
||||
goto out;
|
||||
}
|
||||
|
||||
disk->rdev = rdev;
|
||||
if (mddev->chunk_sectors) {
|
||||
sectors = rdev->sectors;
|
||||
sector_div(sectors, mddev->chunk_sectors);
|
||||
rdev->sectors = sectors * mddev->chunk_sectors;
|
||||
}
|
||||
|
||||
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
||||
rdev->data_offset << 9);
|
||||
|
||||
conf->array_sectors += rdev->sectors;
|
||||
cnt++;
|
||||
}
|
||||
if (cnt != raid_disks) {
|
||||
pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
|
||||
mdname(mddev));
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Here we calculate the device offsets.
|
||||
*/
|
||||
conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
|
||||
|
||||
for (i = 1; i < raid_disks; i++)
|
||||
conf->disks[i].end_sector =
|
||||
conf->disks[i-1].end_sector +
|
||||
conf->disks[i].rdev->sectors;
|
||||
|
||||
return conf;
|
||||
|
||||
out:
|
||||
kfree(conf);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int linear_run (struct mddev *mddev)
|
||||
{
|
||||
struct linear_conf *conf;
|
||||
int ret;
|
||||
|
||||
if (md_check_no_bitmap(mddev))
|
||||
return -EINVAL;
|
||||
conf = linear_conf(mddev, mddev->raid_disks);
|
||||
|
||||
if (!conf)
|
||||
return 1;
|
||||
mddev->private = conf;
|
||||
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
|
||||
|
||||
ret = md_integrity_register(mddev);
|
||||
if (ret) {
|
||||
kfree(conf);
|
||||
mddev->private = NULL;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int linear_add(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
/* Adding a drive to a linear array allows the array to grow.
|
||||
* It is permitted if the new drive has a matching superblock
|
||||
* already on it, with raid_disk equal to raid_disks.
|
||||
* It is achieved by creating a new linear_private_data structure
|
||||
* and swapping it in in-place of the current one.
|
||||
* The current one is never freed until the array is stopped.
|
||||
* This avoids races.
|
||||
*/
|
||||
struct linear_conf *newconf, *oldconf;
|
||||
|
||||
if (rdev->saved_raid_disk != mddev->raid_disks)
|
||||
return -EINVAL;
|
||||
|
||||
rdev->raid_disk = rdev->saved_raid_disk;
|
||||
rdev->saved_raid_disk = -1;
|
||||
|
||||
newconf = linear_conf(mddev,mddev->raid_disks+1);
|
||||
|
||||
if (!newconf)
|
||||
return -ENOMEM;
|
||||
|
||||
/* newconf->raid_disks already keeps a copy of * the increased
|
||||
* value of mddev->raid_disks, WARN_ONCE() is just used to make
|
||||
* sure of this. It is possible that oldconf is still referenced
|
||||
* in linear_congested(), therefore kfree_rcu() is used to free
|
||||
* oldconf until no one uses it anymore.
|
||||
*/
|
||||
oldconf = rcu_dereference_protected(mddev->private,
|
||||
lockdep_is_held(&mddev->reconfig_mutex));
|
||||
mddev->raid_disks++;
|
||||
WARN_ONCE(mddev->raid_disks != newconf->raid_disks,
|
||||
"copied raid_disks doesn't match mddev->raid_disks");
|
||||
rcu_assign_pointer(mddev->private, newconf);
|
||||
md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
|
||||
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
|
||||
kfree_rcu(oldconf, rcu);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Release the private configuration handed over in @priv. */
static void linear_free(struct mddev *mddev, void *priv)
{
	kfree(priv);
}
|
||||
|
||||
static bool linear_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct dev_info *tmp_dev;
|
||||
sector_t start_sector, end_sector, data_offset;
|
||||
sector_t bio_sector = bio->bi_iter.bi_sector;
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
|
||||
&& md_flush_request(mddev, bio))
|
||||
return true;
|
||||
|
||||
tmp_dev = which_dev(mddev, bio_sector);
|
||||
start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
|
||||
end_sector = tmp_dev->end_sector;
|
||||
data_offset = tmp_dev->rdev->data_offset;
|
||||
|
||||
if (unlikely(bio_sector >= end_sector ||
|
||||
bio_sector < start_sector))
|
||||
goto out_of_bounds;
|
||||
|
||||
if (unlikely(is_rdev_broken(tmp_dev->rdev))) {
|
||||
md_error(mddev, tmp_dev->rdev);
|
||||
bio_io_error(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (unlikely(bio_end_sector(bio) > end_sector)) {
|
||||
/* This bio crosses a device boundary, so we have to split it */
|
||||
struct bio *split = bio_split(bio, end_sector - bio_sector,
|
||||
GFP_NOIO, &mddev->bio_set);
|
||||
bio_chain(split, bio);
|
||||
submit_bio_noacct(bio);
|
||||
bio = split;
|
||||
}
|
||||
|
||||
md_account_bio(mddev, &bio);
|
||||
bio_set_dev(bio, tmp_dev->rdev->bdev);
|
||||
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
|
||||
start_sector + data_offset;
|
||||
|
||||
if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
|
||||
!bdev_max_discard_sectors(bio->bi_bdev))) {
|
||||
/* Just ignore it */
|
||||
bio_endio(bio);
|
||||
} else {
|
||||
if (mddev->gendisk)
|
||||
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
|
||||
bio_sector);
|
||||
mddev_check_write_zeroes(mddev, bio);
|
||||
submit_bio_noacct(bio);
|
||||
}
|
||||
return true;
|
||||
|
||||
out_of_bounds:
|
||||
pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %pg: %llu sectors, offset %llu\n",
|
||||
mdname(mddev),
|
||||
(unsigned long long)bio->bi_iter.bi_sector,
|
||||
tmp_dev->rdev->bdev,
|
||||
(unsigned long long)tmp_dev->rdev->sectors,
|
||||
(unsigned long long)start_sector);
|
||||
bio_io_error(bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void linear_status (struct seq_file *seq, struct mddev *mddev)
|
||||
{
|
||||
seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
|
||||
}
|
||||
|
||||
static void linear_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
if (!test_and_set_bit(MD_BROKEN, &mddev->flags)) {
|
||||
char *md_name = mdname(mddev);
|
||||
|
||||
pr_crit("md/linear%s: Disk failure on %pg detected, failing array.\n",
|
||||
md_name, rdev->bdev);
|
||||
}
|
||||
}
|
||||
|
||||
/* Quiesce hook: the linear personality has nothing to do here. */
static void linear_quiesce(struct mddev *mddev, int state)
{
	/* intentionally empty */
}
|
||||
|
||||
static struct md_personality linear_personality =
|
||||
{
|
||||
.name = "linear",
|
||||
.level = LEVEL_LINEAR,
|
||||
.owner = THIS_MODULE,
|
||||
.make_request = linear_make_request,
|
||||
.run = linear_run,
|
||||
.free = linear_free,
|
||||
.status = linear_status,
|
||||
.hot_add_disk = linear_add,
|
||||
.size = linear_size,
|
||||
.quiesce = linear_quiesce,
|
||||
.error_handler = linear_error,
|
||||
};
|
||||
|
||||
static int __init linear_init (void)
|
||||
{
|
||||
return register_md_personality (&linear_personality);
|
||||
}
|
||||
|
||||
static void linear_exit (void)
|
||||
{
|
||||
unregister_md_personality (&linear_personality);
|
||||
}
|
||||
|
||||
module_init(linear_init);
|
||||
module_exit(linear_exit);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)");
|
||||
MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/
|
||||
MODULE_ALIAS("md-linear");
|
||||
MODULE_ALIAS("md-level--1");
|
@ -1,471 +0,0 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* multipath.c : Multiple Devices driver for Linux
|
||||
*
|
||||
* Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
|
||||
*
|
||||
* Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
|
||||
*
|
||||
* MULTIPATH management functions.
|
||||
*
|
||||
* derived from raid1.c.
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/raid/md_u.h>
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/slab.h>
|
||||
#include "md.h"
|
||||
#include "md-multipath.h"
|
||||
|
||||
#define MAX_WORK_PER_DISK 128
|
||||
|
||||
#define NR_RESERVED_BUFS 32
|
||||
|
||||
static int multipath_map (struct mpconf *conf)
|
||||
{
|
||||
int i, disks = conf->raid_disks;
|
||||
|
||||
/*
|
||||
* Later we do read balancing on the read side
|
||||
* now we use the first available disk.
|
||||
*/
|
||||
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
|
||||
if (rdev && test_bit(In_sync, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags)) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
return i;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
|
||||
return (-1);
|
||||
}
|
||||
|
||||
static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
|
||||
{
|
||||
unsigned long flags;
|
||||
struct mddev *mddev = mp_bh->mddev;
|
||||
struct mpconf *conf = mddev->private;
|
||||
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
list_add(&mp_bh->retry_list, &conf->retry_list);
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
md_wakeup_thread(mddev->thread);
|
||||
}
|
||||
|
||||
/*
|
||||
* multipath_end_bh_io() is called when we have finished servicing a multipathed
|
||||
* operation and are ready to return a success/failure code to the buffer
|
||||
* cache layer.
|
||||
*/
|
||||
static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status)
|
||||
{
|
||||
struct bio *bio = mp_bh->master_bio;
|
||||
struct mpconf *conf = mp_bh->mddev->private;
|
||||
|
||||
bio->bi_status = status;
|
||||
bio_endio(bio);
|
||||
mempool_free(mp_bh, &conf->pool);
|
||||
}
|
||||
|
||||
static void multipath_end_request(struct bio *bio)
|
||||
{
|
||||
struct multipath_bh *mp_bh = bio->bi_private;
|
||||
struct mpconf *conf = mp_bh->mddev->private;
|
||||
struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev;
|
||||
|
||||
if (!bio->bi_status)
|
||||
multipath_end_bh_io(mp_bh, 0);
|
||||
else if (!(bio->bi_opf & REQ_RAHEAD)) {
|
||||
/*
|
||||
* oops, IO error:
|
||||
*/
|
||||
md_error (mp_bh->mddev, rdev);
|
||||
pr_info("multipath: %pg: rescheduling sector %llu\n",
|
||||
rdev->bdev,
|
||||
(unsigned long long)bio->bi_iter.bi_sector);
|
||||
multipath_reschedule_retry(mp_bh);
|
||||
} else
|
||||
multipath_end_bh_io(mp_bh, bio->bi_status);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
|
||||
static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
|
||||
{
|
||||
struct mpconf *conf = mddev->private;
|
||||
struct multipath_bh * mp_bh;
|
||||
struct multipath_info *multipath;
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH)
|
||||
&& md_flush_request(mddev, bio))
|
||||
return true;
|
||||
|
||||
md_account_bio(mddev, &bio);
|
||||
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
|
||||
|
||||
mp_bh->master_bio = bio;
|
||||
mp_bh->mddev = mddev;
|
||||
|
||||
mp_bh->path = multipath_map(conf);
|
||||
if (mp_bh->path < 0) {
|
||||
bio_io_error(bio);
|
||||
mempool_free(mp_bh, &conf->pool);
|
||||
return true;
|
||||
}
|
||||
multipath = conf->multipaths + mp_bh->path;
|
||||
|
||||
bio_init_clone(multipath->rdev->bdev, &mp_bh->bio, bio, GFP_NOIO);
|
||||
|
||||
mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset;
|
||||
mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT;
|
||||
mp_bh->bio.bi_end_io = multipath_end_request;
|
||||
mp_bh->bio.bi_private = mp_bh;
|
||||
mddev_check_write_zeroes(mddev, &mp_bh->bio);
|
||||
submit_bio_noacct(&mp_bh->bio);
|
||||
return true;
|
||||
}
|
||||
|
||||
static void multipath_status(struct seq_file *seq, struct mddev *mddev)
|
||||
{
|
||||
struct mpconf *conf = mddev->private;
|
||||
int i;
|
||||
|
||||
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
|
||||
conf->raid_disks - mddev->degraded);
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
|
||||
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
|
||||
}
|
||||
rcu_read_unlock();
|
||||
seq_putc(seq, ']');
|
||||
}
|
||||
|
||||
/*
|
||||
* Careful, this can execute in IRQ contexts as well!
|
||||
*/
|
||||
static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
struct mpconf *conf = mddev->private;
|
||||
|
||||
if (conf->raid_disks - mddev->degraded <= 1) {
|
||||
/*
|
||||
* Uh oh, we can do nothing if this is our last path, but
|
||||
* first check if this is a queued request for a device
|
||||
* which has just failed.
|
||||
*/
|
||||
pr_warn("multipath: only one IO path left and IO error.\n");
|
||||
/* leave it active... it's all we have */
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Mark disk as unusable
|
||||
*/
|
||||
if (test_and_clear_bit(In_sync, &rdev->flags)) {
|
||||
unsigned long flags;
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
mddev->degraded++;
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
}
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
|
||||
pr_err("multipath: IO failure on %pg, disabling IO path.\n"
|
||||
"multipath: Operation continuing on %d IO paths.\n",
|
||||
rdev->bdev,
|
||||
conf->raid_disks - mddev->degraded);
|
||||
}
|
||||
|
||||
static void print_multipath_conf (struct mpconf *conf)
|
||||
{
|
||||
int i;
|
||||
struct multipath_info *tmp;
|
||||
|
||||
pr_debug("MULTIPATH conf printout:\n");
|
||||
if (!conf) {
|
||||
pr_debug("(conf==NULL)\n");
|
||||
return;
|
||||
}
|
||||
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
|
||||
conf->raid_disks);
|
||||
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
tmp = conf->multipaths + i;
|
||||
if (tmp->rdev)
|
||||
pr_debug(" disk%d, o:%d, dev:%pg\n",
|
||||
i,!test_bit(Faulty, &tmp->rdev->flags),
|
||||
tmp->rdev->bdev);
|
||||
}
|
||||
}
|
||||
|
||||
static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
struct mpconf *conf = mddev->private;
|
||||
int err = -EEXIST;
|
||||
int path;
|
||||
struct multipath_info *p;
|
||||
int first = 0;
|
||||
int last = mddev->raid_disks - 1;
|
||||
|
||||
if (rdev->raid_disk >= 0)
|
||||
first = last = rdev->raid_disk;
|
||||
|
||||
print_multipath_conf(conf);
|
||||
|
||||
for (path = first; path <= last; path++)
|
||||
if ((p=conf->multipaths+path)->rdev == NULL) {
|
||||
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
||||
rdev->data_offset << 9);
|
||||
|
||||
err = md_integrity_add_rdev(rdev, mddev);
|
||||
if (err)
|
||||
break;
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
mddev->degraded--;
|
||||
rdev->raid_disk = path;
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
rcu_assign_pointer(p->rdev, rdev);
|
||||
err = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
print_multipath_conf(conf);
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
struct mpconf *conf = mddev->private;
|
||||
int err = 0;
|
||||
int number = rdev->raid_disk;
|
||||
struct multipath_info *p = conf->multipaths + number;
|
||||
|
||||
print_multipath_conf(conf);
|
||||
|
||||
if (rdev == p->rdev) {
|
||||
if (test_bit(In_sync, &rdev->flags) ||
|
||||
atomic_read(&rdev->nr_pending)) {
|
||||
pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
|
||||
err = -EBUSY;
|
||||
goto abort;
|
||||
}
|
||||
p->rdev = NULL;
|
||||
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
|
||||
synchronize_rcu();
|
||||
if (atomic_read(&rdev->nr_pending)) {
|
||||
/* lost the race, try later */
|
||||
err = -EBUSY;
|
||||
p->rdev = rdev;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
err = md_integrity_register(mddev);
|
||||
}
|
||||
abort:
|
||||
|
||||
print_multipath_conf(conf);
|
||||
return err;
|
||||
}
|
||||
|
||||
/*
 * This is a kernel thread which:
 *
 *	1.	Retries failed read operations on working multipaths.
 *	2.	Updates the raid superblock when problems are encountered.
 *	3.	Performs writes following reads for array synchronising.
 */
|
||||
|
||||
static void multipathd(struct md_thread *thread)
|
||||
{
|
||||
struct mddev *mddev = thread->mddev;
|
||||
struct multipath_bh *mp_bh;
|
||||
struct bio *bio;
|
||||
unsigned long flags;
|
||||
struct mpconf *conf = mddev->private;
|
||||
struct list_head *head = &conf->retry_list;
|
||||
|
||||
md_check_recovery(mddev);
|
||||
for (;;) {
|
||||
spin_lock_irqsave(&conf->device_lock, flags);
|
||||
if (list_empty(head))
|
||||
break;
|
||||
mp_bh = list_entry(head->prev, struct multipath_bh, retry_list);
|
||||
list_del(head->prev);
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
|
||||
bio = &mp_bh->bio;
|
||||
bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
|
||||
|
||||
if ((mp_bh->path = multipath_map (conf))<0) {
|
||||
pr_err("multipath: %pg: unrecoverable IO read error for block %llu\n",
|
||||
bio->bi_bdev,
|
||||
(unsigned long long)bio->bi_iter.bi_sector);
|
||||
multipath_end_bh_io(mp_bh, BLK_STS_IOERR);
|
||||
} else {
|
||||
pr_err("multipath: %pg: redirecting sector %llu to another IO path\n",
|
||||
bio->bi_bdev,
|
||||
(unsigned long long)bio->bi_iter.bi_sector);
|
||||
*bio = *(mp_bh->master_bio);
|
||||
bio->bi_iter.bi_sector +=
|
||||
conf->multipaths[mp_bh->path].rdev->data_offset;
|
||||
bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev);
|
||||
bio->bi_opf |= REQ_FAILFAST_TRANSPORT;
|
||||
bio->bi_end_io = multipath_end_request;
|
||||
bio->bi_private = mp_bh;
|
||||
submit_bio_noacct(bio);
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&conf->device_lock, flags);
|
||||
}
|
||||
|
||||
/*
 * Report the array size: always mddev->dev_sectors.
 *
 * @sectors and @raid_disks are expected to be zero; a non-zero value of
 * either triggers a one-time warning and is otherwise ignored.
 */
static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks)
{
	bool reshape_requested = sectors || raid_disks;

	WARN_ONCE(reshape_requested,
		  "%s does not support generic reshape\n", __func__);

	return mddev->dev_sectors;
}
|
||||
|
||||
static int multipath_run (struct mddev *mddev)
|
||||
{
|
||||
struct mpconf *conf;
|
||||
int disk_idx;
|
||||
struct multipath_info *disk;
|
||||
struct md_rdev *rdev;
|
||||
int working_disks;
|
||||
int ret;
|
||||
|
||||
if (md_check_no_bitmap(mddev))
|
||||
return -EINVAL;
|
||||
|
||||
if (mddev->level != LEVEL_MULTIPATH) {
|
||||
pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
|
||||
mdname(mddev), mddev->level);
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* copy the already verified devices into our private MULTIPATH
|
||||
* bookkeeping area. [whatever we allocate in multipath_run(),
|
||||
* should be freed in multipath_free()]
|
||||
*/
|
||||
|
||||
conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
|
||||
mddev->private = conf;
|
||||
if (!conf)
|
||||
goto out;
|
||||
|
||||
conf->multipaths = kcalloc(mddev->raid_disks,
|
||||
sizeof(struct multipath_info),
|
||||
GFP_KERNEL);
|
||||
if (!conf->multipaths)
|
||||
goto out_free_conf;
|
||||
|
||||
working_disks = 0;
|
||||
rdev_for_each(rdev, mddev) {
|
||||
disk_idx = rdev->raid_disk;
|
||||
if (disk_idx < 0 ||
|
||||
disk_idx >= mddev->raid_disks)
|
||||
continue;
|
||||
|
||||
disk = conf->multipaths + disk_idx;
|
||||
disk->rdev = rdev;
|
||||
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
||||
rdev->data_offset << 9);
|
||||
|
||||
if (!test_bit(Faulty, &rdev->flags))
|
||||
working_disks++;
|
||||
}
|
||||
|
||||
conf->raid_disks = mddev->raid_disks;
|
||||
conf->mddev = mddev;
|
||||
spin_lock_init(&conf->device_lock);
|
||||
INIT_LIST_HEAD(&conf->retry_list);
|
||||
|
||||
if (!working_disks) {
|
||||
pr_warn("multipath: no operational IO paths for %s\n",
|
||||
mdname(mddev));
|
||||
goto out_free_conf;
|
||||
}
|
||||
mddev->degraded = conf->raid_disks - working_disks;
|
||||
|
||||
ret = mempool_init_kmalloc_pool(&conf->pool, NR_RESERVED_BUFS,
|
||||
sizeof(struct multipath_bh));
|
||||
if (ret)
|
||||
goto out_free_conf;
|
||||
|
||||
rcu_assign_pointer(mddev->thread,
|
||||
md_register_thread(multipathd, mddev, "multipath"));
|
||||
if (!mddev->thread)
|
||||
goto out_free_conf;
|
||||
|
||||
pr_info("multipath: array %s active with %d out of %d IO paths\n",
|
||||
mdname(mddev), conf->raid_disks - mddev->degraded,
|
||||
mddev->raid_disks);
|
||||
/*
|
||||
* Ok, everything is just fine now
|
||||
*/
|
||||
md_set_array_sectors(mddev, multipath_size(mddev, 0, 0));
|
||||
|
||||
if (md_integrity_register(mddev))
|
||||
goto out_free_conf;
|
||||
|
||||
return 0;
|
||||
|
||||
out_free_conf:
|
||||
mempool_exit(&conf->pool);
|
||||
kfree(conf->multipaths);
|
||||
kfree(conf);
|
||||
mddev->private = NULL;
|
||||
out:
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static void multipath_free(struct mddev *mddev, void *priv)
|
||||
{
|
||||
struct mpconf *conf = priv;
|
||||
|
||||
mempool_exit(&conf->pool);
|
||||
kfree(conf->multipaths);
|
||||
kfree(conf);
|
||||
}
|
||||
|
||||
static struct md_personality multipath_personality =
|
||||
{
|
||||
.name = "multipath",
|
||||
.level = LEVEL_MULTIPATH,
|
||||
.owner = THIS_MODULE,
|
||||
.make_request = multipath_make_request,
|
||||
.run = multipath_run,
|
||||
.free = multipath_free,
|
||||
.status = multipath_status,
|
||||
.error_handler = multipath_error,
|
||||
.hot_add_disk = multipath_add_disk,
|
||||
.hot_remove_disk= multipath_remove_disk,
|
||||
.size = multipath_size,
|
||||
};
|
||||
|
||||
/* Module init: register the multipath personality and return the result. */
static int __init multipath_init(void)
{
	return register_md_personality(&multipath_personality);
}
|
||||
|
||||
/* Module exit: unregister the multipath personality. */
static void __exit multipath_exit(void)
{
	unregister_md_personality(&multipath_personality);
}
|
||||
|
||||
module_init(multipath_init);
|
||||
module_exit(multipath_exit);
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)");
|
||||
MODULE_ALIAS("md-personality-7"); /* MULTIPATH */
|
||||
MODULE_ALIAS("md-multipath");
|
||||
MODULE_ALIAS("md-level--4");
|
305
drivers/md/md.c
305
drivers/md/md.c
@ -543,6 +543,9 @@ static void md_end_flush(struct bio *bio)
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
|
||||
if (atomic_dec_and_test(&mddev->flush_pending)) {
|
||||
/* The pair is percpu_ref_get() from md_flush_request() */
|
||||
percpu_ref_put(&mddev->active_io);
|
||||
|
||||
/* The pre-request flush has finished */
|
||||
queue_work(md_wq, &mddev->flush_work);
|
||||
}
|
||||
@ -562,12 +565,8 @@ static void submit_flushes(struct work_struct *ws)
|
||||
rdev_for_each_rcu(rdev, mddev)
|
||||
if (rdev->raid_disk >= 0 &&
|
||||
!test_bit(Faulty, &rdev->flags)) {
|
||||
/* Take two references, one is dropped
|
||||
* when request finishes, one after
|
||||
* we reclaim rcu_read_lock
|
||||
*/
|
||||
struct bio *bi;
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
bi = bio_alloc_bioset(rdev->bdev, 0,
|
||||
@ -578,7 +577,6 @@ static void submit_flushes(struct work_struct *ws)
|
||||
atomic_inc(&mddev->flush_pending);
|
||||
submit_bio(bi);
|
||||
rcu_read_lock();
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (atomic_dec_and_test(&mddev->flush_pending))
|
||||
@ -631,6 +629,18 @@ bool md_flush_request(struct mddev *mddev, struct bio *bio)
|
||||
/* new request after previous flush is completed */
|
||||
if (ktime_after(req_start, mddev->prev_flush_start)) {
|
||||
WARN_ON(mddev->flush_bio);
|
||||
/*
|
||||
* Grab a reference to make sure mddev_suspend() will wait for
|
||||
* this flush to be done.
|
||||
*
|
||||
* md_flush_reqeust() is called under md_handle_request() and
|
||||
* 'active_io' is already grabbed, hence percpu_ref_is_zero()
|
||||
* won't pass, percpu_ref_tryget_live() can't be used because
|
||||
* percpu_ref_kill() can be called by mddev_suspend()
|
||||
* concurrently.
|
||||
*/
|
||||
WARN_ON(percpu_ref_is_zero(&mddev->active_io));
|
||||
percpu_ref_get(&mddev->active_io);
|
||||
mddev->flush_bio = bio;
|
||||
bio = NULL;
|
||||
}
|
||||
@ -1027,9 +1037,10 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
|
||||
return;
|
||||
|
||||
bio = bio_alloc_bioset(rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev,
|
||||
1,
|
||||
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA,
|
||||
GFP_NOIO, &mddev->sync_set);
|
||||
1,
|
||||
REQ_OP_WRITE | REQ_SYNC | REQ_IDLE | REQ_META
|
||||
| REQ_PREFLUSH | REQ_FUA,
|
||||
GFP_NOIO, &mddev->sync_set);
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
|
||||
@ -1209,6 +1220,7 @@ struct super_type {
|
||||
struct md_rdev *refdev,
|
||||
int minor_version);
|
||||
int (*validate_super)(struct mddev *mddev,
|
||||
struct md_rdev *freshest,
|
||||
struct md_rdev *rdev);
|
||||
void (*sync_super)(struct mddev *mddev,
|
||||
struct md_rdev *rdev);
|
||||
@ -1289,17 +1301,11 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
|
||||
rdev->sb_size = MD_SB_BYTES;
|
||||
rdev->badblocks.shift = -1;
|
||||
|
||||
if (sb->level == LEVEL_MULTIPATH)
|
||||
rdev->desc_nr = -1;
|
||||
else
|
||||
rdev->desc_nr = sb->this_disk.number;
|
||||
rdev->desc_nr = sb->this_disk.number;
|
||||
|
||||
/* not spare disk, or LEVEL_MULTIPATH */
|
||||
if (sb->level == LEVEL_MULTIPATH ||
|
||||
(rdev->desc_nr >= 0 &&
|
||||
rdev->desc_nr < MD_SB_DISKS &&
|
||||
sb->disks[rdev->desc_nr].state &
|
||||
((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
|
||||
/* not spare disk */
|
||||
if (rdev->desc_nr >= 0 && rdev->desc_nr < MD_SB_DISKS &&
|
||||
sb->disks[rdev->desc_nr].state & ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
|
||||
spare_disk = false;
|
||||
|
||||
if (!refdev) {
|
||||
@ -1346,8 +1352,9 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
|
||||
|
||||
/*
|
||||
* validate_super for 0.90.0
|
||||
* note: we are not using "freshest" for 0.9 superblock
|
||||
*/
|
||||
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
static int super_90_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
|
||||
{
|
||||
mdp_disk_t *desc;
|
||||
mdp_super_t *sb = page_address(rdev->sb_page);
|
||||
@ -1445,31 +1452,28 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (mddev->level != LEVEL_MULTIPATH) {
|
||||
desc = sb->disks + rdev->desc_nr;
|
||||
desc = sb->disks + rdev->desc_nr;
|
||||
|
||||
if (desc->state & (1<<MD_DISK_FAULTY))
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
else if (desc->state & (1<<MD_DISK_SYNC) /* &&
|
||||
desc->raid_disk < mddev->raid_disks */) {
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
rdev->raid_disk = desc->raid_disk;
|
||||
rdev->saved_raid_disk = desc->raid_disk;
|
||||
} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
|
||||
/* active but not in sync implies recovery up to
|
||||
* reshape position. We don't know exactly where
|
||||
* that is, so set to zero for now */
|
||||
if (mddev->minor_version >= 91) {
|
||||
rdev->recovery_offset = 0;
|
||||
rdev->raid_disk = desc->raid_disk;
|
||||
}
|
||||
}
|
||||
if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
|
||||
set_bit(WriteMostly, &rdev->flags);
|
||||
if (desc->state & (1<<MD_DISK_FAILFAST))
|
||||
set_bit(FailFast, &rdev->flags);
|
||||
} else /* MULTIPATH are always insync */
|
||||
if (desc->state & (1<<MD_DISK_FAULTY))
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
else if (desc->state & (1<<MD_DISK_SYNC)) {
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
rdev->raid_disk = desc->raid_disk;
|
||||
rdev->saved_raid_disk = desc->raid_disk;
|
||||
} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
|
||||
/* active but not in sync implies recovery up to
|
||||
* reshape position. We don't know exactly where
|
||||
* that is, so set to zero for now
|
||||
*/
|
||||
if (mddev->minor_version >= 91) {
|
||||
rdev->recovery_offset = 0;
|
||||
rdev->raid_disk = desc->raid_disk;
|
||||
}
|
||||
}
|
||||
if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
|
||||
set_bit(WriteMostly, &rdev->flags);
|
||||
if (desc->state & (1<<MD_DISK_FAILFAST))
|
||||
set_bit(FailFast, &rdev->flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1759,10 +1763,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
|
||||
&& rdev->new_data_offset < sb_start + (rdev->sb_size/512))
|
||||
return -EINVAL;
|
||||
|
||||
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
|
||||
rdev->desc_nr = -1;
|
||||
else
|
||||
rdev->desc_nr = le32_to_cpu(sb->dev_number);
|
||||
rdev->desc_nr = le32_to_cpu(sb->dev_number);
|
||||
|
||||
if (!rdev->bb_page) {
|
||||
rdev->bb_page = alloc_page(GFP_KERNEL);
|
||||
@ -1815,12 +1816,10 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
|
||||
sb->level != 0)
|
||||
return -EINVAL;
|
||||
|
||||
/* not spare disk, or LEVEL_MULTIPATH */
|
||||
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
|
||||
(rdev->desc_nr >= 0 &&
|
||||
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
|
||||
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
|
||||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
|
||||
/* not spare disk */
|
||||
if (rdev->desc_nr >= 0 && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
|
||||
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
|
||||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
|
||||
spare_disk = false;
|
||||
|
||||
if (!refdev) {
|
||||
@ -1859,10 +1858,11 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
static int super_1_validate(struct mddev *mddev, struct md_rdev *freshest, struct md_rdev *rdev)
|
||||
{
|
||||
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
|
||||
__u64 ev1 = le64_to_cpu(sb->events);
|
||||
int role;
|
||||
|
||||
rdev->raid_disk = -1;
|
||||
clear_bit(Faulty, &rdev->flags);
|
||||
@ -1955,13 +1955,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
}
|
||||
} else if (mddev->pers == NULL) {
|
||||
/* Insist of good event counter while assembling, except for
|
||||
* spares (which don't need an event count) */
|
||||
++ev1;
|
||||
* spares (which don't need an event count).
|
||||
* Similar to mdadm, we allow event counter difference of 1
|
||||
* from the freshest device.
|
||||
*/
|
||||
if (rdev->desc_nr >= 0 &&
|
||||
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
|
||||
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
|
||||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
|
||||
if (ev1 < mddev->events)
|
||||
if (ev1 + 1 < mddev->events)
|
||||
return -EINVAL;
|
||||
} else if (mddev->bitmap) {
|
||||
/* If adding to array with a bitmap, then we can accept an
|
||||
@ -1976,58 +1978,85 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
|
||||
/* just a hot-add of a new device, leave raid_disk at -1 */
|
||||
return 0;
|
||||
}
|
||||
if (mddev->level != LEVEL_MULTIPATH) {
|
||||
int role;
|
||||
if (rdev->desc_nr < 0 ||
|
||||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
|
||||
role = MD_DISK_ROLE_SPARE;
|
||||
rdev->desc_nr = -1;
|
||||
} else
|
||||
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
|
||||
switch(role) {
|
||||
case MD_DISK_ROLE_SPARE: /* spare */
|
||||
break;
|
||||
case MD_DISK_ROLE_FAULTY: /* faulty */
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
break;
|
||||
case MD_DISK_ROLE_JOURNAL: /* journal device */
|
||||
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
|
||||
/* journal device without journal feature */
|
||||
pr_warn("md: journal device provided without journal feature, ignoring the device\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
set_bit(Journal, &rdev->flags);
|
||||
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
|
||||
rdev->raid_disk = 0;
|
||||
break;
|
||||
default:
|
||||
rdev->saved_raid_disk = role;
|
||||
if ((le32_to_cpu(sb->feature_map) &
|
||||
MD_FEATURE_RECOVERY_OFFSET)) {
|
||||
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
|
||||
if (!(le32_to_cpu(sb->feature_map) &
|
||||
MD_FEATURE_RECOVERY_BITMAP))
|
||||
rdev->saved_raid_disk = -1;
|
||||
} else {
|
||||
/*
|
||||
* If the array is FROZEN, then the device can't
|
||||
* be in_sync with rest of array.
|
||||
*/
|
||||
if (!test_bit(MD_RECOVERY_FROZEN,
|
||||
&mddev->recovery))
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
}
|
||||
rdev->raid_disk = role;
|
||||
break;
|
||||
|
||||
if (rdev->desc_nr < 0 ||
|
||||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
|
||||
role = MD_DISK_ROLE_SPARE;
|
||||
rdev->desc_nr = -1;
|
||||
} else if (mddev->pers == NULL && freshest && ev1 < mddev->events) {
|
||||
/*
|
||||
* If we are assembling, and our event counter is smaller than the
|
||||
* highest event counter, we cannot trust our superblock about the role.
|
||||
* It could happen that our rdev was marked as Faulty, and all other
|
||||
* superblocks were updated with +1 event counter.
|
||||
* Then, before the next superblock update, which typically happens when
|
||||
* remove_and_add_spares() removes the device from the array, there was
|
||||
* a crash or reboot.
|
||||
* If we allow current rdev without consulting the freshest superblock,
|
||||
* we could cause data corruption.
|
||||
* Note that in this case our event counter is smaller by 1 than the
|
||||
* highest, otherwise, this rdev would not be allowed into array;
|
||||
* both kernel and mdadm allow event counter difference of 1.
|
||||
*/
|
||||
struct mdp_superblock_1 *freshest_sb = page_address(freshest->sb_page);
|
||||
u32 freshest_max_dev = le32_to_cpu(freshest_sb->max_dev);
|
||||
|
||||
if (rdev->desc_nr >= freshest_max_dev) {
|
||||
/* this is unexpected, better not proceed */
|
||||
pr_warn("md: %s: rdev[%pg]: desc_nr(%d) >= freshest(%pg)->sb->max_dev(%u)\n",
|
||||
mdname(mddev), rdev->bdev, rdev->desc_nr,
|
||||
freshest->bdev, freshest_max_dev);
|
||||
return -EUCLEAN;
|
||||
}
|
||||
if (sb->devflags & WriteMostly1)
|
||||
set_bit(WriteMostly, &rdev->flags);
|
||||
if (sb->devflags & FailFast1)
|
||||
set_bit(FailFast, &rdev->flags);
|
||||
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
|
||||
set_bit(Replacement, &rdev->flags);
|
||||
} else /* MULTIPATH are always insync */
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
|
||||
role = le16_to_cpu(freshest_sb->dev_roles[rdev->desc_nr]);
|
||||
pr_debug("md: %s: rdev[%pg]: role=%d(0x%x) according to freshest %pg\n",
|
||||
mdname(mddev), rdev->bdev, role, role, freshest->bdev);
|
||||
} else {
|
||||
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
|
||||
}
|
||||
switch (role) {
|
||||
case MD_DISK_ROLE_SPARE: /* spare */
|
||||
break;
|
||||
case MD_DISK_ROLE_FAULTY: /* faulty */
|
||||
set_bit(Faulty, &rdev->flags);
|
||||
break;
|
||||
case MD_DISK_ROLE_JOURNAL: /* journal device */
|
||||
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
|
||||
/* journal device without journal feature */
|
||||
pr_warn("md: journal device provided without journal feature, ignoring the device\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
set_bit(Journal, &rdev->flags);
|
||||
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
|
||||
rdev->raid_disk = 0;
|
||||
break;
|
||||
default:
|
||||
rdev->saved_raid_disk = role;
|
||||
if ((le32_to_cpu(sb->feature_map) &
|
||||
MD_FEATURE_RECOVERY_OFFSET)) {
|
||||
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
|
||||
if (!(le32_to_cpu(sb->feature_map) &
|
||||
MD_FEATURE_RECOVERY_BITMAP))
|
||||
rdev->saved_raid_disk = -1;
|
||||
} else {
|
||||
/*
|
||||
* If the array is FROZEN, then the device can't
|
||||
* be in_sync with rest of array.
|
||||
*/
|
||||
if (!test_bit(MD_RECOVERY_FROZEN,
|
||||
&mddev->recovery))
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
}
|
||||
rdev->raid_disk = role;
|
||||
break;
|
||||
}
|
||||
if (sb->devflags & WriteMostly1)
|
||||
set_bit(WriteMostly, &rdev->flags);
|
||||
if (sb->devflags & FailFast1)
|
||||
set_bit(FailFast, &rdev->flags);
|
||||
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
|
||||
set_bit(Replacement, &rdev->flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2845,10 +2874,6 @@ rewrite:
|
||||
} else
|
||||
pr_debug("md: %pg (skipping faulty)\n",
|
||||
rdev->bdev);
|
||||
|
||||
if (mddev->level == LEVEL_MULTIPATH)
|
||||
/* only need to write one superblock... */
|
||||
break;
|
||||
}
|
||||
if (md_super_wait(mddev) < 0)
|
||||
goto rewrite;
|
||||
@ -2890,7 +2915,7 @@ static int add_bound_rdev(struct md_rdev *rdev)
|
||||
* and should be added immediately.
|
||||
*/
|
||||
super_types[mddev->major_version].
|
||||
validate_super(mddev, rdev);
|
||||
validate_super(mddev, NULL/*freshest*/, rdev);
|
||||
err = mddev->pers->hot_add_disk(mddev, rdev);
|
||||
if (err) {
|
||||
md_kick_rdev_from_array(rdev);
|
||||
@ -3827,7 +3852,7 @@ static int analyze_sbs(struct mddev *mddev)
|
||||
}
|
||||
|
||||
super_types[mddev->major_version].
|
||||
validate_super(mddev, freshest);
|
||||
validate_super(mddev, NULL/*freshest*/, freshest);
|
||||
|
||||
i = 0;
|
||||
rdev_for_each_safe(rdev, tmp, mddev) {
|
||||
@ -3842,20 +3867,15 @@ static int analyze_sbs(struct mddev *mddev)
|
||||
}
|
||||
if (rdev != freshest) {
|
||||
if (super_types[mddev->major_version].
|
||||
validate_super(mddev, rdev)) {
|
||||
validate_super(mddev, freshest, rdev)) {
|
||||
pr_warn("md: kicking non-fresh %pg from array!\n",
|
||||
rdev->bdev);
|
||||
md_kick_rdev_from_array(rdev);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (mddev->level == LEVEL_MULTIPATH) {
|
||||
rdev->desc_nr = i++;
|
||||
rdev->raid_disk = rdev->desc_nr;
|
||||
set_bit(In_sync, &rdev->flags);
|
||||
} else if (rdev->raid_disk >=
|
||||
(mddev->raid_disks - min(0, mddev->delta_disks)) &&
|
||||
!test_bit(Journal, &rdev->flags)) {
|
||||
if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks)) &&
|
||||
!test_bit(Journal, &rdev->flags)) {
|
||||
rdev->raid_disk = -1;
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
}
|
||||
@ -6833,7 +6853,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
|
||||
rdev->saved_raid_disk = rdev->raid_disk;
|
||||
} else
|
||||
super_types[mddev->major_version].
|
||||
validate_super(mddev, rdev);
|
||||
validate_super(mddev, NULL/*freshest*/, rdev);
|
||||
if ((info->state & (1<<MD_DISK_SYNC)) &&
|
||||
rdev->raid_disk != info->raid_disk) {
|
||||
/* This was a hot-add request, but events doesn't
|
||||
@ -8076,7 +8096,7 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
|
||||
return;
|
||||
mddev->pers->error_handler(mddev, rdev);
|
||||
|
||||
if (mddev->pers->level == 0 || mddev->pers->level == LEVEL_LINEAR)
|
||||
if (mddev->pers->level == 0)
|
||||
return;
|
||||
|
||||
if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
|
||||
@ -9240,46 +9260,21 @@ static int remove_and_add_spares(struct mddev *mddev,
|
||||
struct md_rdev *rdev;
|
||||
int spares = 0;
|
||||
int removed = 0;
|
||||
bool remove_some = false;
|
||||
|
||||
if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
|
||||
/* Mustn't remove devices when resync thread is running */
|
||||
return 0;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if ((this == NULL || rdev == this) &&
|
||||
rdev->raid_disk >= 0 &&
|
||||
!test_bit(Blocked, &rdev->flags) &&
|
||||
test_bit(Faulty, &rdev->flags) &&
|
||||
atomic_read(&rdev->nr_pending)==0) {
|
||||
/* Faulty non-Blocked devices with nr_pending == 0
|
||||
* never get nr_pending incremented,
|
||||
* never get Faulty cleared, and never get Blocked set.
|
||||
* So we can synchronize_rcu now rather than once per device
|
||||
*/
|
||||
remove_some = true;
|
||||
set_bit(RemoveSynchronized, &rdev->flags);
|
||||
if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
|
||||
!mddev->pers->hot_remove_disk(mddev, rdev)) {
|
||||
sysfs_unlink_rdev(mddev, rdev);
|
||||
rdev->saved_raid_disk = rdev->raid_disk;
|
||||
rdev->raid_disk = -1;
|
||||
removed++;
|
||||
}
|
||||
}
|
||||
|
||||
if (remove_some)
|
||||
synchronize_rcu();
|
||||
rdev_for_each(rdev, mddev) {
|
||||
if ((this == NULL || rdev == this) &&
|
||||
(test_bit(RemoveSynchronized, &rdev->flags) ||
|
||||
rdev_removeable(rdev))) {
|
||||
if (mddev->pers->hot_remove_disk(
|
||||
mddev, rdev) == 0) {
|
||||
sysfs_unlink_rdev(mddev, rdev);
|
||||
rdev->saved_raid_disk = rdev->raid_disk;
|
||||
rdev->raid_disk = -1;
|
||||
removed++;
|
||||
}
|
||||
}
|
||||
if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
|
||||
clear_bit(RemoveSynchronized, &rdev->flags);
|
||||
}
|
||||
|
||||
if (removed && mddev->kobj.sd)
|
||||
sysfs_notify_dirent_safe(mddev->sysfs_degraded);
|
||||
|
||||
|
@ -190,11 +190,6 @@ enum flag_bits {
|
||||
* than other devices in the array
|
||||
*/
|
||||
ClusterRemove,
|
||||
RemoveSynchronized, /* synchronize_rcu() was called after
|
||||
* this device was known to be faulty,
|
||||
* so it is safe to remove without
|
||||
* another synchronize_rcu() call.
|
||||
*/
|
||||
ExternalBbl, /* External metadata provides bad
|
||||
* block management for a disk
|
||||
*/
|
||||
|
@ -173,3 +173,57 @@ static inline void raid1_prepare_flush_writes(struct bitmap *bitmap)
|
||||
else
|
||||
md_bitmap_unplug(bitmap);
|
||||
}
|
||||
|
||||
/*
|
||||
* Used by fix_read_error() to decay the per rdev read_errors.
|
||||
* We halve the read error count for every hour that has elapsed
|
||||
* since the last recorded read error.
|
||||
*/
|
||||
static inline void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
long cur_time_mon;
|
||||
unsigned long hours_since_last;
|
||||
unsigned int read_errors = atomic_read(&rdev->read_errors);
|
||||
|
||||
cur_time_mon = ktime_get_seconds();
|
||||
|
||||
if (rdev->last_read_error == 0) {
|
||||
/* first time we've seen a read error */
|
||||
rdev->last_read_error = cur_time_mon;
|
||||
return;
|
||||
}
|
||||
|
||||
hours_since_last = (long)(cur_time_mon -
|
||||
rdev->last_read_error) / 3600;
|
||||
|
||||
rdev->last_read_error = cur_time_mon;
|
||||
|
||||
/*
|
||||
* if hours_since_last is > the number of bits in read_errors
|
||||
* just set read errors to 0. We do this to avoid
|
||||
* overflowing the shift of read_errors by hours_since_last.
|
||||
*/
|
||||
if (hours_since_last >= 8 * sizeof(read_errors))
|
||||
atomic_set(&rdev->read_errors, 0);
|
||||
else
|
||||
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
|
||||
}
|
||||
|
||||
static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
|
||||
int read_errors;
|
||||
|
||||
check_decay_read_errors(mddev, rdev);
|
||||
read_errors = atomic_inc_return(&rdev->read_errors);
|
||||
if (read_errors > max_read_errors) {
|
||||
pr_notice("md/"RAID_1_10_NAME":%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
|
||||
mdname(mddev), rdev->bdev, read_errors, max_read_errors);
|
||||
pr_notice("md/"RAID_1_10_NAME":%s: %pg: Failing raid device\n",
|
||||
mdname(mddev), rdev->bdev);
|
||||
md_error(mddev, rdev);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -49,6 +49,7 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
|
||||
#define raid1_log(md, fmt, args...) \
|
||||
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
|
||||
|
||||
#define RAID_1_10_NAME "raid1"
|
||||
#include "raid1-10.c"
|
||||
|
||||
#define START(node) ((node)->start)
|
||||
@ -609,7 +610,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
|
||||
int choose_first;
|
||||
int choose_next_idle;
|
||||
|
||||
rcu_read_lock();
|
||||
/*
|
||||
* Check if we can balance. We can balance on the whole
|
||||
* device if no resync is going on, or below the resync window.
|
||||
@ -642,7 +642,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
|
||||
unsigned int pending;
|
||||
bool nonrot;
|
||||
|
||||
rdev = rcu_dereference(conf->mirrors[disk].rdev);
|
||||
rdev = conf->mirrors[disk].rdev;
|
||||
if (r1_bio->bios[disk] == IO_BLOCKED
|
||||
|| rdev == NULL
|
||||
|| test_bit(Faulty, &rdev->flags))
|
||||
@ -773,7 +773,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
|
||||
}
|
||||
|
||||
if (best_disk >= 0) {
|
||||
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
|
||||
rdev = conf->mirrors[best_disk].rdev;
|
||||
if (!rdev)
|
||||
goto retry;
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
@ -784,7 +784,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
|
||||
|
||||
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
*max_sectors = sectors;
|
||||
|
||||
return best_disk;
|
||||
@ -1126,8 +1125,6 @@ static void alloc_behind_master_bio(struct r1bio *r1_bio,
|
||||
|
||||
behind_bio = bio_alloc_bioset(NULL, vcnt, 0, GFP_NOIO,
|
||||
&r1_bio->mddev->bio_set);
|
||||
if (!behind_bio)
|
||||
return;
|
||||
|
||||
/* discard op, we don't support writezero/writesame yet */
|
||||
if (!bio_has_data(bio)) {
|
||||
@ -1235,14 +1232,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
|
||||
|
||||
if (r1bio_existed) {
|
||||
/* Need to get the block device name carefully */
|
||||
struct md_rdev *rdev;
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
|
||||
|
||||
if (rdev)
|
||||
snprintf(b, sizeof(b), "%pg", rdev->bdev);
|
||||
else
|
||||
strcpy(b, "???");
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1396,10 +1391,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
|
||||
disks = conf->raid_disks * 2;
|
||||
blocked_rdev = NULL;
|
||||
rcu_read_lock();
|
||||
max_sectors = r1_bio->sectors;
|
||||
for (i = 0; i < disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[i].rdev;
|
||||
|
||||
/*
|
||||
* The write-behind io is only attempted on drives marked as
|
||||
@ -1465,7 +1459,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
|
||||
}
|
||||
r1_bio->bios[i] = bio;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(blocked_rdev)) {
|
||||
/* Wait for this device to become unblocked */
|
||||
@ -1617,15 +1610,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
|
||||
struct r1conf *conf = mddev->private;
|
||||
int i;
|
||||
|
||||
lockdep_assert_held(&mddev->lock);
|
||||
|
||||
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
|
||||
conf->raid_disks - mddev->degraded);
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
|
||||
|
||||
seq_printf(seq, "%s",
|
||||
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
|
||||
}
|
||||
rcu_read_unlock();
|
||||
seq_printf(seq, "]");
|
||||
}
|
||||
|
||||
@ -1691,16 +1685,15 @@ static void print_conf(struct r1conf *conf)
|
||||
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
|
||||
conf->raid_disks);
|
||||
|
||||
rcu_read_lock();
|
||||
lockdep_assert_held(&conf->mddev->reconfig_mutex);
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[i].rdev;
|
||||
if (rdev)
|
||||
pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
|
||||
i, !test_bit(In_sync, &rdev->flags),
|
||||
!test_bit(Faulty, &rdev->flags),
|
||||
rdev->bdev);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
static void close_sync(struct r1conf *conf)
|
||||
@ -1810,7 +1803,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
*/
|
||||
if (rdev->saved_raid_disk < 0)
|
||||
conf->fullsync = 1;
|
||||
rcu_assign_pointer(p->rdev, rdev);
|
||||
WRITE_ONCE(p->rdev, rdev);
|
||||
break;
|
||||
}
|
||||
if (test_bit(WantReplacement, &p->rdev->flags) &&
|
||||
@ -1826,7 +1819,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
rdev->raid_disk = repl_slot;
|
||||
err = 0;
|
||||
conf->fullsync = 1;
|
||||
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
|
||||
WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
|
||||
}
|
||||
|
||||
print_conf(conf);
|
||||
@ -1862,16 +1855,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
err = -EBUSY;
|
||||
goto abort;
|
||||
}
|
||||
p->rdev = NULL;
|
||||
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
|
||||
synchronize_rcu();
|
||||
if (atomic_read(&rdev->nr_pending)) {
|
||||
/* lost the race, try later */
|
||||
err = -EBUSY;
|
||||
p->rdev = rdev;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
WRITE_ONCE(p->rdev, NULL);
|
||||
if (conf->mirrors[conf->raid_disks + number].rdev) {
|
||||
/* We just removed a device that is being replaced.
|
||||
* Move down the replacement. We drain all IO before
|
||||
@ -1892,7 +1876,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
goto abort;
|
||||
}
|
||||
clear_bit(Replacement, &repl->flags);
|
||||
p->rdev = repl;
|
||||
WRITE_ONCE(p->rdev, repl);
|
||||
conf->mirrors[conf->raid_disks + number].rdev = NULL;
|
||||
unfreeze_array(conf);
|
||||
}
|
||||
@ -2272,16 +2256,24 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
|
||||
* 3. Performs writes following reads for array synchronising.
|
||||
*/
|
||||
|
||||
static void fix_read_error(struct r1conf *conf, int read_disk,
|
||||
sector_t sect, int sectors)
|
||||
static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
||||
{
|
||||
sector_t sect = r1_bio->sector;
|
||||
int sectors = r1_bio->sectors;
|
||||
int read_disk = r1_bio->read_disk;
|
||||
struct mddev *mddev = conf->mddev;
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[read_disk].rdev);
|
||||
|
||||
if (exceed_read_errors(mddev, rdev)) {
|
||||
r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
|
||||
return;
|
||||
}
|
||||
|
||||
while(sectors) {
|
||||
int s = sectors;
|
||||
int d = read_disk;
|
||||
int success = 0;
|
||||
int start;
|
||||
struct md_rdev *rdev;
|
||||
|
||||
if (s > (PAGE_SIZE>>9))
|
||||
s = PAGE_SIZE >> 9;
|
||||
@ -2290,8 +2282,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
|
||||
sector_t first_bad;
|
||||
int bad_sectors;
|
||||
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (rdev &&
|
||||
(test_bit(In_sync, &rdev->flags) ||
|
||||
(!test_bit(Faulty, &rdev->flags) &&
|
||||
@ -2299,15 +2290,14 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
|
||||
is_badblock(rdev, sect, s,
|
||||
&first_bad, &bad_sectors) == 0) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
if (sync_page_io(rdev, sect, s<<9,
|
||||
conf->tmppage, REQ_OP_READ, false))
|
||||
success = 1;
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
if (success)
|
||||
break;
|
||||
} else
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
d++;
|
||||
if (d == conf->raid_disks * 2)
|
||||
d = 0;
|
||||
@ -2326,29 +2316,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
|
||||
if (d==0)
|
||||
d = conf->raid_disks * 2;
|
||||
d--;
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (rdev &&
|
||||
!test_bit(Faulty, &rdev->flags)) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
r1_sync_page_io(rdev, sect, s,
|
||||
conf->tmppage, WRITE);
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
} else
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
d = start;
|
||||
while (d != read_disk) {
|
||||
if (d==0)
|
||||
d = conf->raid_disks * 2;
|
||||
d--;
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (rdev &&
|
||||
!test_bit(Faulty, &rdev->flags)) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
if (r1_sync_page_io(rdev, sect, s,
|
||||
conf->tmppage, READ)) {
|
||||
atomic_add(s, &rdev->corrected_errors);
|
||||
@ -2359,8 +2344,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
|
||||
rdev->bdev);
|
||||
}
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
} else
|
||||
rcu_read_unlock();
|
||||
}
|
||||
}
|
||||
sectors -= s;
|
||||
sect += s;
|
||||
@ -2530,8 +2514,7 @@ static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
|
||||
if (mddev->ro == 0
|
||||
&& !test_bit(FailFast, &rdev->flags)) {
|
||||
freeze_array(conf, 1);
|
||||
fix_read_error(conf, r1_bio->read_disk,
|
||||
r1_bio->sector, r1_bio->sectors);
|
||||
fix_read_error(conf, r1_bio);
|
||||
unfreeze_array(conf);
|
||||
} else if (mddev->ro == 0 && test_bit(FailFast, &rdev->flags)) {
|
||||
md_error(mddev, rdev);
|
||||
@ -2741,7 +2724,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
|
||||
r1_bio = raid1_alloc_init_r1buf(conf);
|
||||
|
||||
rcu_read_lock();
|
||||
/*
|
||||
* If we get a correctably read error during resync or recovery,
|
||||
* we might want to read from a different device. So we
|
||||
@ -2762,7 +2744,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
struct md_rdev *rdev;
|
||||
bio = r1_bio->bios[i];
|
||||
|
||||
rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
rdev = conf->mirrors[i].rdev;
|
||||
if (rdev == NULL ||
|
||||
test_bit(Faulty, &rdev->flags)) {
|
||||
if (i < conf->raid_disks)
|
||||
@ -2820,7 +2802,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_opf |= MD_FAILFAST;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (disk < 0)
|
||||
disk = wonly;
|
||||
r1_bio->read_disk = disk;
|
||||
|
@ -19,6 +19,8 @@
|
||||
#include <linux/raid/md_p.h>
|
||||
#include <trace/events/block.h>
|
||||
#include "md.h"
|
||||
|
||||
#define RAID_1_10_NAME "raid10"
|
||||
#include "raid10.h"
|
||||
#include "raid0.h"
|
||||
#include "md-bitmap.h"
|
||||
@ -743,7 +745,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
|
||||
struct geom *geo = &conf->geo;
|
||||
|
||||
raid10_find_phys(conf, r10_bio);
|
||||
rcu_read_lock();
|
||||
best_dist_slot = -1;
|
||||
min_pending = UINT_MAX;
|
||||
best_dist_rdev = NULL;
|
||||
@ -775,18 +776,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
|
||||
if (r10_bio->devs[slot].bio == IO_BLOCKED)
|
||||
continue;
|
||||
disk = r10_bio->devs[slot].devnum;
|
||||
rdev = rcu_dereference(conf->mirrors[disk].replacement);
|
||||
rdev = conf->mirrors[disk].replacement;
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
|
||||
r10_bio->devs[slot].addr + sectors >
|
||||
rdev->recovery_offset) {
|
||||
/*
|
||||
* Read replacement first to prevent reading both rdev
|
||||
* and replacement as NULL during replacement replace
|
||||
* rdev.
|
||||
*/
|
||||
smp_mb();
|
||||
rdev = rcu_dereference(conf->mirrors[disk].rdev);
|
||||
}
|
||||
rdev->recovery_offset)
|
||||
rdev = conf->mirrors[disk].rdev;
|
||||
if (rdev == NULL ||
|
||||
test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
@ -876,7 +870,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
|
||||
r10_bio->read_slot = slot;
|
||||
} else
|
||||
rdev = NULL;
|
||||
rcu_read_unlock();
|
||||
*max_sectors = best_good_sectors;
|
||||
|
||||
return rdev;
|
||||
@ -1198,9 +1191,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
|
||||
*/
|
||||
gfp = GFP_NOIO | __GFP_HIGH;
|
||||
|
||||
rcu_read_lock();
|
||||
disk = r10_bio->devs[slot].devnum;
|
||||
err_rdev = rcu_dereference(conf->mirrors[disk].rdev);
|
||||
err_rdev = conf->mirrors[disk].rdev;
|
||||
if (err_rdev)
|
||||
snprintf(b, sizeof(b), "%pg", err_rdev->bdev);
|
||||
else {
|
||||
@ -1208,7 +1200,6 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
|
||||
/* This never gets dereferenced */
|
||||
err_rdev = r10_bio->devs[slot].rdev;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors))
|
||||
@ -1279,15 +1270,8 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
|
||||
int devnum = r10_bio->devs[n_copy].devnum;
|
||||
struct bio *mbio;
|
||||
|
||||
if (replacement) {
|
||||
rdev = conf->mirrors[devnum].replacement;
|
||||
if (rdev == NULL) {
|
||||
/* Replacement just got moved to main 'rdev' */
|
||||
smp_mb();
|
||||
rdev = conf->mirrors[devnum].rdev;
|
||||
}
|
||||
} else
|
||||
rdev = conf->mirrors[devnum].rdev;
|
||||
rdev = replacement ? conf->mirrors[devnum].replacement :
|
||||
conf->mirrors[devnum].rdev;
|
||||
|
||||
mbio = bio_alloc_clone(rdev->bdev, bio, GFP_NOIO, &mddev->bio_set);
|
||||
if (replacement)
|
||||
@ -1321,25 +1305,6 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
|
||||
}
|
||||
}
|
||||
|
||||
static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
|
||||
struct md_rdev **prrdev)
|
||||
{
|
||||
struct md_rdev *rdev, *rrdev;
|
||||
|
||||
rrdev = rcu_dereference(mirror->replacement);
|
||||
/*
|
||||
* Read replacement first to prevent reading both rdev and
|
||||
* replacement as NULL during replacement replace rdev.
|
||||
*/
|
||||
smp_mb();
|
||||
rdev = rcu_dereference(mirror->rdev);
|
||||
if (rdev == rrdev)
|
||||
rrdev = NULL;
|
||||
|
||||
*prrdev = rrdev;
|
||||
return rdev;
|
||||
}
|
||||
|
||||
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
|
||||
{
|
||||
int i;
|
||||
@ -1348,11 +1313,11 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
|
||||
|
||||
retry_wait:
|
||||
blocked_rdev = NULL;
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->copies; i++) {
|
||||
struct md_rdev *rdev, *rrdev;
|
||||
|
||||
rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
|
||||
rdev = conf->mirrors[i].rdev;
|
||||
rrdev = conf->mirrors[i].replacement;
|
||||
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
blocked_rdev = rdev;
|
||||
@ -1391,7 +1356,6 @@ retry_wait:
|
||||
}
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(blocked_rdev)) {
|
||||
/* Have to wait for this device to get unblocked, then retry */
|
||||
@ -1474,14 +1438,14 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
|
||||
wait_blocked_dev(mddev, r10_bio);
|
||||
|
||||
rcu_read_lock();
|
||||
max_sectors = r10_bio->sectors;
|
||||
|
||||
for (i = 0; i < conf->copies; i++) {
|
||||
int d = r10_bio->devs[i].devnum;
|
||||
struct md_rdev *rdev, *rrdev;
|
||||
|
||||
rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
rrdev = conf->mirrors[d].replacement;
|
||||
if (rdev && (test_bit(Faulty, &rdev->flags)))
|
||||
rdev = NULL;
|
||||
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
|
||||
@ -1535,7 +1499,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (max_sectors < r10_bio->sectors)
|
||||
r10_bio->sectors = max_sectors;
|
||||
@ -1625,17 +1588,8 @@ static void raid10_end_discard_request(struct bio *bio)
|
||||
set_bit(R10BIO_Uptodate, &r10_bio->state);
|
||||
|
||||
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
|
||||
if (repl)
|
||||
rdev = conf->mirrors[dev].replacement;
|
||||
if (!rdev) {
|
||||
/*
|
||||
* raid10_remove_disk uses smp_mb to make sure rdev is set to
|
||||
* replacement before setting replacement to NULL. It can read
|
||||
* rdev first without barrier protect even replacement is NULL
|
||||
*/
|
||||
smp_rmb();
|
||||
rdev = conf->mirrors[dev].rdev;
|
||||
}
|
||||
rdev = repl ? conf->mirrors[dev].replacement :
|
||||
conf->mirrors[dev].rdev;
|
||||
|
||||
raid_end_discard_bio(r10_bio);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
@ -1785,11 +1739,11 @@ retry_discard:
|
||||
* inc refcount on their rdev. Record them by setting
|
||||
* bios[x] to bio
|
||||
*/
|
||||
rcu_read_lock();
|
||||
for (disk = 0; disk < geo->raid_disks; disk++) {
|
||||
struct md_rdev *rdev, *rrdev;
|
||||
|
||||
rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
|
||||
rdev = conf->mirrors[disk].rdev;
|
||||
rrdev = conf->mirrors[disk].replacement;
|
||||
r10_bio->devs[disk].bio = NULL;
|
||||
r10_bio->devs[disk].repl_bio = NULL;
|
||||
|
||||
@ -1809,7 +1763,6 @@ retry_discard:
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
atomic_set(&r10_bio->remaining, 1);
|
||||
for (disk = 0; disk < geo->raid_disks; disk++) {
|
||||
@ -1939,6 +1892,8 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
|
||||
struct r10conf *conf = mddev->private;
|
||||
int i;
|
||||
|
||||
lockdep_assert_held(&mddev->lock);
|
||||
|
||||
if (conf->geo.near_copies < conf->geo.raid_disks)
|
||||
seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
|
||||
if (conf->geo.near_copies > 1)
|
||||
@ -1953,12 +1908,11 @@ static void raid10_status(struct seq_file *seq, struct mddev *mddev)
|
||||
}
|
||||
seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
|
||||
conf->geo.raid_disks - mddev->degraded);
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->geo.raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
|
||||
|
||||
seq_printf(seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
|
||||
}
|
||||
rcu_read_unlock();
|
||||
seq_printf(seq, "]");
|
||||
}
|
||||
|
||||
@ -1980,7 +1934,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
|
||||
ncopies = conf->geo.near_copies;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
do {
|
||||
int n = conf->copies;
|
||||
int cnt = 0;
|
||||
@ -1988,7 +1941,7 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
|
||||
while (n--) {
|
||||
struct md_rdev *rdev;
|
||||
if (this != ignore &&
|
||||
(rdev = rcu_dereference(conf->mirrors[this].rdev)) &&
|
||||
(rdev = conf->mirrors[this].rdev) &&
|
||||
test_bit(In_sync, &rdev->flags))
|
||||
cnt++;
|
||||
this = (this+1) % disks;
|
||||
@ -1999,7 +1952,6 @@ static int _enough(struct r10conf *conf, int previous, int ignore)
|
||||
} while (first != 0);
|
||||
has_enough = 1;
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
return has_enough;
|
||||
}
|
||||
|
||||
@ -2072,8 +2024,7 @@ static void print_conf(struct r10conf *conf)
|
||||
pr_debug(" --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
|
||||
conf->geo.raid_disks);
|
||||
|
||||
/* This is only called with ->reconfix_mutex held, so
|
||||
* rcu protection of rdev is not needed */
|
||||
lockdep_assert_held(&conf->mddev->reconfig_mutex);
|
||||
for (i = 0; i < conf->geo.raid_disks; i++) {
|
||||
rdev = conf->mirrors[i].rdev;
|
||||
if (rdev)
|
||||
@ -2190,7 +2141,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
err = 0;
|
||||
if (rdev->saved_raid_disk != mirror)
|
||||
conf->fullsync = 1;
|
||||
rcu_assign_pointer(p->rdev, rdev);
|
||||
WRITE_ONCE(p->rdev, rdev);
|
||||
break;
|
||||
}
|
||||
|
||||
@ -2204,7 +2155,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
disk_stack_limits(mddev->gendisk, rdev->bdev,
|
||||
rdev->data_offset << 9);
|
||||
conf->fullsync = 1;
|
||||
rcu_assign_pointer(p->replacement, rdev);
|
||||
WRITE_ONCE(p->replacement, rdev);
|
||||
}
|
||||
|
||||
print_conf(conf);
|
||||
@ -2246,24 +2197,12 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
err = -EBUSY;
|
||||
goto abort;
|
||||
}
|
||||
*rdevp = NULL;
|
||||
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
|
||||
synchronize_rcu();
|
||||
if (atomic_read(&rdev->nr_pending)) {
|
||||
/* lost the race, try later */
|
||||
err = -EBUSY;
|
||||
*rdevp = rdev;
|
||||
goto abort;
|
||||
}
|
||||
}
|
||||
WRITE_ONCE(*rdevp, NULL);
|
||||
if (p->replacement) {
|
||||
/* We must have just cleared 'rdev' */
|
||||
p->rdev = p->replacement;
|
||||
WRITE_ONCE(p->rdev, p->replacement);
|
||||
clear_bit(Replacement, &p->replacement->flags);
|
||||
smp_mb(); /* Make sure other CPUs may see both as identical
|
||||
* but will never see neither -- if they are careful.
|
||||
*/
|
||||
p->replacement = NULL;
|
||||
WRITE_ONCE(p->replacement, NULL);
|
||||
}
|
||||
|
||||
clear_bit(WantReplacement, &rdev->flags);
|
||||
@ -2655,42 +2594,6 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Used by fix_read_error() to decay the per rdev read_errors.
|
||||
* We halve the read error count for every hour that has elapsed
|
||||
* since the last recorded read error.
|
||||
*
|
||||
*/
|
||||
static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
|
||||
{
|
||||
long cur_time_mon;
|
||||
unsigned long hours_since_last;
|
||||
unsigned int read_errors = atomic_read(&rdev->read_errors);
|
||||
|
||||
cur_time_mon = ktime_get_seconds();
|
||||
|
||||
if (rdev->last_read_error == 0) {
|
||||
/* first time we've seen a read error */
|
||||
rdev->last_read_error = cur_time_mon;
|
||||
return;
|
||||
}
|
||||
|
||||
hours_since_last = (long)(cur_time_mon -
|
||||
rdev->last_read_error) / 3600;
|
||||
|
||||
rdev->last_read_error = cur_time_mon;
|
||||
|
||||
/*
|
||||
* if hours_since_last is > the number of bits in read_errors
|
||||
* just set read errors to 0. We do this to avoid
|
||||
* overflowing the shift of read_errors by hours_since_last.
|
||||
*/
|
||||
if (hours_since_last >= 8 * sizeof(read_errors))
|
||||
atomic_set(&rdev->read_errors, 0);
|
||||
else
|
||||
atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
|
||||
}
|
||||
|
||||
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
|
||||
int sectors, struct page *page, enum req_op op)
|
||||
{
|
||||
@ -2728,7 +2631,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
int sect = 0; /* Offset from r10_bio->sector */
|
||||
int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
|
||||
struct md_rdev *rdev;
|
||||
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
|
||||
int d = r10_bio->devs[slot].devnum;
|
||||
|
||||
/* still own a reference to this rdev, so it cannot
|
||||
@ -2741,15 +2643,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
more fix_read_error() attempts */
|
||||
return;
|
||||
|
||||
check_decay_read_errors(mddev, rdev);
|
||||
atomic_inc(&rdev->read_errors);
|
||||
if (atomic_read(&rdev->read_errors) > max_read_errors) {
|
||||
pr_notice("md/raid10:%s: %pg: Raid device exceeded read_error threshold [cur %d:max %d]\n",
|
||||
mdname(mddev), rdev->bdev,
|
||||
atomic_read(&rdev->read_errors), max_read_errors);
|
||||
pr_notice("md/raid10:%s: %pg: Failing raid device\n",
|
||||
mdname(mddev), rdev->bdev);
|
||||
md_error(mddev, rdev);
|
||||
if (exceed_read_errors(mddev, rdev)) {
|
||||
r10_bio->devs[slot].bio = IO_BLOCKED;
|
||||
return;
|
||||
}
|
||||
@ -2763,20 +2657,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
if (s > (PAGE_SIZE>>9))
|
||||
s = PAGE_SIZE >> 9;
|
||||
|
||||
rcu_read_lock();
|
||||
do {
|
||||
sector_t first_bad;
|
||||
int bad_sectors;
|
||||
|
||||
d = r10_bio->devs[sl].devnum;
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (rdev &&
|
||||
test_bit(In_sync, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags) &&
|
||||
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
|
||||
&first_bad, &bad_sectors) == 0) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
success = sync_page_io(rdev,
|
||||
r10_bio->devs[sl].addr +
|
||||
sect,
|
||||
@ -2784,7 +2676,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
conf->tmppage,
|
||||
REQ_OP_READ, false);
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
rcu_read_lock();
|
||||
if (success)
|
||||
break;
|
||||
}
|
||||
@ -2792,7 +2683,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
if (sl == conf->copies)
|
||||
sl = 0;
|
||||
} while (sl != slot);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!success) {
|
||||
/* Cannot read from anywhere, just mark the block
|
||||
@ -2816,20 +2706,18 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
|
||||
start = sl;
|
||||
/* write it back and re-read */
|
||||
rcu_read_lock();
|
||||
while (sl != slot) {
|
||||
if (sl==0)
|
||||
sl = conf->copies;
|
||||
sl--;
|
||||
d = r10_bio->devs[sl].devnum;
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (!rdev ||
|
||||
test_bit(Faulty, &rdev->flags) ||
|
||||
!test_bit(In_sync, &rdev->flags))
|
||||
continue;
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
if (r10_sync_page_io(rdev,
|
||||
r10_bio->devs[sl].addr +
|
||||
sect,
|
||||
@ -2848,7 +2736,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
rdev->bdev);
|
||||
}
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
rcu_read_lock();
|
||||
}
|
||||
sl = start;
|
||||
while (sl != slot) {
|
||||
@ -2856,14 +2743,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
sl = conf->copies;
|
||||
sl--;
|
||||
d = r10_bio->devs[sl].devnum;
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (!rdev ||
|
||||
test_bit(Faulty, &rdev->flags) ||
|
||||
!test_bit(In_sync, &rdev->flags))
|
||||
continue;
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
switch (r10_sync_page_io(rdev,
|
||||
r10_bio->devs[sl].addr +
|
||||
sect,
|
||||
@ -2891,9 +2777,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
|
||||
}
|
||||
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
rcu_read_lock();
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
sectors -= s;
|
||||
sect += s;
|
||||
@ -3367,14 +3251,13 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
/* Completed a full sync so the replacements
|
||||
* are now fully recovered.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->geo.raid_disks; i++) {
|
||||
struct md_rdev *rdev =
|
||||
rcu_dereference(conf->mirrors[i].replacement);
|
||||
conf->mirrors[i].replacement;
|
||||
|
||||
if (rdev)
|
||||
rdev->recovery_offset = MaxSector;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
conf->fullsync = 0;
|
||||
}
|
||||
@ -3455,9 +3338,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
struct raid10_info *mirror = &conf->mirrors[i];
|
||||
struct md_rdev *mrdev, *mreplace;
|
||||
|
||||
rcu_read_lock();
|
||||
mrdev = rcu_dereference(mirror->rdev);
|
||||
mreplace = rcu_dereference(mirror->replacement);
|
||||
mrdev = mirror->rdev;
|
||||
mreplace = mirror->replacement;
|
||||
|
||||
if (mrdev && (test_bit(Faulty, &mrdev->flags) ||
|
||||
test_bit(In_sync, &mrdev->flags)))
|
||||
@ -3465,22 +3347,18 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
if (mreplace && test_bit(Faulty, &mreplace->flags))
|
||||
mreplace = NULL;
|
||||
|
||||
if (!mrdev && !mreplace) {
|
||||
rcu_read_unlock();
|
||||
if (!mrdev && !mreplace)
|
||||
continue;
|
||||
}
|
||||
|
||||
still_degraded = 0;
|
||||
/* want to reconstruct this device */
|
||||
rb2 = r10_bio;
|
||||
sect = raid10_find_virt(conf, sector_nr, i);
|
||||
if (sect >= mddev->resync_max_sectors) {
|
||||
if (sect >= mddev->resync_max_sectors)
|
||||
/* last stripe is not complete - don't
|
||||
* try to recover this sector.
|
||||
*/
|
||||
rcu_read_unlock();
|
||||
continue;
|
||||
}
|
||||
/* Unless we are doing a full sync, or a replacement
|
||||
* we only need to recover the block if it is set in
|
||||
* the bitmap
|
||||
@ -3496,14 +3374,12 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
* that there will never be anything to do here
|
||||
*/
|
||||
chunks_skipped = -1;
|
||||
rcu_read_unlock();
|
||||
continue;
|
||||
}
|
||||
if (mrdev)
|
||||
atomic_inc(&mrdev->nr_pending);
|
||||
if (mreplace)
|
||||
atomic_inc(&mreplace->nr_pending);
|
||||
rcu_read_unlock();
|
||||
|
||||
r10_bio = raid10_alloc_init_r10buf(conf);
|
||||
r10_bio->state = 0;
|
||||
@ -3522,10 +3398,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
/* Need to check if the array will still be
|
||||
* degraded
|
||||
*/
|
||||
rcu_read_lock();
|
||||
for (j = 0; j < conf->geo.raid_disks; j++) {
|
||||
struct md_rdev *rdev = rcu_dereference(
|
||||
conf->mirrors[j].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[j].rdev;
|
||||
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
|
||||
still_degraded = 1;
|
||||
break;
|
||||
@ -3540,8 +3415,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
int k;
|
||||
int d = r10_bio->devs[j].devnum;
|
||||
sector_t from_addr, to_addr;
|
||||
struct md_rdev *rdev =
|
||||
rcu_dereference(conf->mirrors[d].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[d].rdev;
|
||||
sector_t sector, first_bad;
|
||||
int bad_sectors;
|
||||
if (!rdev ||
|
||||
@ -3620,7 +3494,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (j == conf->copies) {
|
||||
/* Cannot recover, so abort the recovery or
|
||||
* record a bad block */
|
||||
@ -3747,12 +3620,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
|
||||
bio = r10_bio->devs[i].bio;
|
||||
bio->bi_status = BLK_STS_IOERR;
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
|
||||
rcu_read_unlock();
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
}
|
||||
|
||||
sector = r10_bio->devs[i].addr;
|
||||
if (is_badblock(rdev, sector, max_sync,
|
||||
&first_bad, &bad_sectors)) {
|
||||
@ -3762,7 +3633,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bad_sectors -= (sector - first_bad);
|
||||
if (max_sync > bad_sectors)
|
||||
max_sync = bad_sectors;
|
||||
rcu_read_unlock();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@ -3778,11 +3648,10 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio_set_dev(bio, rdev->bdev);
|
||||
count++;
|
||||
|
||||
rdev = rcu_dereference(conf->mirrors[d].replacement);
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags)) {
|
||||
rcu_read_unlock();
|
||||
rdev = conf->mirrors[d].replacement;
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
}
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
|
||||
/* Need to set up for writing to the replacement */
|
||||
@ -3799,7 +3668,6 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
|
||||
bio->bi_iter.bi_sector = sector + rdev->data_offset;
|
||||
bio_set_dev(bio, rdev->bdev);
|
||||
count++;
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
if (count < 2) {
|
||||
@ -4509,11 +4377,11 @@ static int calc_degraded(struct r10conf *conf)
|
||||
int degraded, degraded2;
|
||||
int i;
|
||||
|
||||
rcu_read_lock();
|
||||
degraded = 0;
|
||||
/* 'prev' section first */
|
||||
for (i = 0; i < conf->prev.raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[i].rdev;
|
||||
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags))
|
||||
degraded++;
|
||||
else if (!test_bit(In_sync, &rdev->flags))
|
||||
@ -4523,13 +4391,12 @@ static int calc_degraded(struct r10conf *conf)
|
||||
*/
|
||||
degraded++;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (conf->geo.raid_disks == conf->prev.raid_disks)
|
||||
return degraded;
|
||||
rcu_read_lock();
|
||||
degraded2 = 0;
|
||||
for (i = 0; i < conf->geo.raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[i].rdev;
|
||||
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags))
|
||||
degraded2++;
|
||||
else if (!test_bit(In_sync, &rdev->flags)) {
|
||||
@ -4542,7 +4409,6 @@ static int calc_degraded(struct r10conf *conf)
|
||||
degraded2++;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (degraded2 > degraded)
|
||||
return degraded2;
|
||||
return degraded;
|
||||
@ -4974,16 +4840,15 @@ read_more:
|
||||
blist = read_bio;
|
||||
read_bio->bi_next = NULL;
|
||||
|
||||
rcu_read_lock();
|
||||
for (s = 0; s < conf->copies*2; s++) {
|
||||
struct bio *b;
|
||||
int d = r10_bio->devs[s/2].devnum;
|
||||
struct md_rdev *rdev2;
|
||||
if (s&1) {
|
||||
rdev2 = rcu_dereference(conf->mirrors[d].replacement);
|
||||
rdev2 = conf->mirrors[d].replacement;
|
||||
b = r10_bio->devs[s/2].repl_bio;
|
||||
} else {
|
||||
rdev2 = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev2 = conf->mirrors[d].rdev;
|
||||
b = r10_bio->devs[s/2].bio;
|
||||
}
|
||||
if (!rdev2 || test_bit(Faulty, &rdev2->flags))
|
||||
@ -5017,7 +4882,6 @@ read_more:
|
||||
sector_nr += len >> 9;
|
||||
nr_sectors += len >> 9;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
r10_bio->sectors = nr_sectors;
|
||||
|
||||
/* Now submit the read */
|
||||
@ -5070,20 +4934,17 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
|
||||
struct bio *b;
|
||||
int d = r10_bio->devs[s/2].devnum;
|
||||
struct md_rdev *rdev;
|
||||
rcu_read_lock();
|
||||
if (s&1) {
|
||||
rdev = rcu_dereference(conf->mirrors[d].replacement);
|
||||
rdev = conf->mirrors[d].replacement;
|
||||
b = r10_bio->devs[s/2].repl_bio;
|
||||
} else {
|
||||
rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
b = r10_bio->devs[s/2].bio;
|
||||
}
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags)) {
|
||||
rcu_read_unlock();
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags))
|
||||
continue;
|
||||
}
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
md_sync_acct_bio(b, r10_bio->sectors);
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
b->bi_next = NULL;
|
||||
@ -5154,10 +5015,9 @@ static int handle_reshape_read_error(struct mddev *mddev,
|
||||
if (s > (PAGE_SIZE >> 9))
|
||||
s = PAGE_SIZE >> 9;
|
||||
|
||||
rcu_read_lock();
|
||||
while (!success) {
|
||||
int d = r10b->devs[slot].devnum;
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[d].rdev;
|
||||
sector_t addr;
|
||||
if (rdev == NULL ||
|
||||
test_bit(Faulty, &rdev->flags) ||
|
||||
@ -5166,14 +5026,12 @@ static int handle_reshape_read_error(struct mddev *mddev,
|
||||
|
||||
addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
success = sync_page_io(rdev,
|
||||
addr,
|
||||
s << 9,
|
||||
pages[idx],
|
||||
REQ_OP_READ, false);
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
rcu_read_lock();
|
||||
if (success)
|
||||
break;
|
||||
failed:
|
||||
@ -5183,7 +5041,6 @@ static int handle_reshape_read_error(struct mddev *mddev,
|
||||
if (slot == first_slot)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (!success) {
|
||||
/* couldn't read this block, must give up */
|
||||
set_bit(MD_RECOVERY_INTR,
|
||||
@ -5209,12 +5066,8 @@ static void end_reshape_write(struct bio *bio)
|
||||
struct md_rdev *rdev = NULL;
|
||||
|
||||
d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
|
||||
if (repl)
|
||||
rdev = conf->mirrors[d].replacement;
|
||||
if (!rdev) {
|
||||
smp_mb();
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
}
|
||||
rdev = repl ? conf->mirrors[d].replacement :
|
||||
conf->mirrors[d].rdev;
|
||||
|
||||
if (bio->bi_status) {
|
||||
/* FIXME should record badblock */
|
||||
@ -5249,18 +5102,16 @@ static void raid10_finish_reshape(struct mddev *mddev)
|
||||
mddev->resync_max_sectors = mddev->array_sectors;
|
||||
} else {
|
||||
int d;
|
||||
rcu_read_lock();
|
||||
for (d = conf->geo.raid_disks ;
|
||||
d < conf->geo.raid_disks - mddev->delta_disks;
|
||||
d++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
|
||||
struct md_rdev *rdev = conf->mirrors[d].rdev;
|
||||
if (rdev)
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
rdev = rcu_dereference(conf->mirrors[d].replacement);
|
||||
rdev = conf->mirrors[d].replacement;
|
||||
if (rdev)
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
mddev->layout = mddev->new_layout;
|
||||
mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
|
||||
|
@ -1890,28 +1890,22 @@ r5l_recovery_replay_one_stripe(struct r5conf *conf,
|
||||
continue;
|
||||
|
||||
/* in case device is broken */
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->disks[disk_index].rdev);
|
||||
rdev = conf->disks[disk_index].rdev;
|
||||
if (rdev) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
sync_page_io(rdev, sh->sector, PAGE_SIZE,
|
||||
sh->dev[disk_index].page, REQ_OP_WRITE,
|
||||
false);
|
||||
rdev_dec_pending(rdev, rdev->mddev);
|
||||
rcu_read_lock();
|
||||
}
|
||||
rrdev = rcu_dereference(conf->disks[disk_index].replacement);
|
||||
rrdev = conf->disks[disk_index].replacement;
|
||||
if (rrdev) {
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
sync_page_io(rrdev, sh->sector, PAGE_SIZE,
|
||||
sh->dev[disk_index].page, REQ_OP_WRITE,
|
||||
false);
|
||||
rdev_dec_pending(rrdev, rrdev->mddev);
|
||||
rcu_read_lock();
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
ctx->data_parity_stripes++;
|
||||
out:
|
||||
@ -2948,7 +2942,6 @@ bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect)
|
||||
if (!log)
|
||||
return false;
|
||||
|
||||
WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
tree_index = r5c_tree_index(conf, sect);
|
||||
slot = radix_tree_lookup(&log->big_stripe_tree, tree_index);
|
||||
return slot != NULL;
|
||||
|
@ -620,11 +620,9 @@ static void ppl_do_flush(struct ppl_io_unit *io)
|
||||
struct md_rdev *rdev;
|
||||
struct block_device *bdev = NULL;
|
||||
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags))
|
||||
bdev = rdev->bdev;
|
||||
rcu_read_unlock();
|
||||
|
||||
if (bdev) {
|
||||
struct bio *bio;
|
||||
@ -882,9 +880,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
|
||||
(unsigned long long)r_sector, dd_idx,
|
||||
(unsigned long long)sector);
|
||||
|
||||
/* Array has not started so rcu dereference is safe */
|
||||
rdev = rcu_dereference_protected(
|
||||
conf->disks[dd_idx].rdev, 1);
|
||||
rdev = conf->disks[dd_idx].rdev;
|
||||
if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
|
||||
sector >= rdev->recovery_offset)) {
|
||||
pr_debug("%s:%*s data member disk %d missing\n",
|
||||
@ -936,9 +932,7 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
|
||||
0, &disk, &sh);
|
||||
BUG_ON(sh.pd_idx != le32_to_cpu(e->parity_disk));
|
||||
|
||||
/* Array has not started so rcu dereference is safe */
|
||||
parity_rdev = rcu_dereference_protected(
|
||||
conf->disks[sh.pd_idx].rdev, 1);
|
||||
parity_rdev = conf->disks[sh.pd_idx].rdev;
|
||||
|
||||
BUG_ON(parity_rdev->bdev->bd_dev != log->rdev->bdev->bd_dev);
|
||||
pr_debug("%s:%*s write parity at sector %llu, disk %pg\n",
|
||||
@ -1404,9 +1398,7 @@ int ppl_init_log(struct r5conf *conf)
|
||||
|
||||
for (i = 0; i < ppl_conf->count; i++) {
|
||||
struct ppl_log *log = &ppl_conf->child_logs[i];
|
||||
/* Array has not started so rcu dereference is safe */
|
||||
struct md_rdev *rdev =
|
||||
rcu_dereference_protected(conf->disks[i].rdev, 1);
|
||||
struct md_rdev *rdev = conf->disks[i].rdev;
|
||||
|
||||
mutex_init(&log->io_mutex);
|
||||
spin_lock_init(&log->io_list_lock);
|
||||
|
@ -36,7 +36,6 @@
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/raid/pq.h>
|
||||
#include <linux/async_tx.h>
|
||||
@ -694,12 +693,12 @@ int raid5_calc_degraded(struct r5conf *conf)
|
||||
int degraded, degraded2;
|
||||
int i;
|
||||
|
||||
rcu_read_lock();
|
||||
degraded = 0;
|
||||
for (i = 0; i < conf->previous_raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
|
||||
|
||||
if (rdev && test_bit(Faulty, &rdev->flags))
|
||||
rdev = rcu_dereference(conf->disks[i].replacement);
|
||||
rdev = READ_ONCE(conf->disks[i].replacement);
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags))
|
||||
degraded++;
|
||||
else if (test_bit(In_sync, &rdev->flags))
|
||||
@ -717,15 +716,14 @@ int raid5_calc_degraded(struct r5conf *conf)
|
||||
if (conf->raid_disks >= conf->previous_raid_disks)
|
||||
degraded++;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (conf->raid_disks == conf->previous_raid_disks)
|
||||
return degraded;
|
||||
rcu_read_lock();
|
||||
degraded2 = 0;
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
|
||||
|
||||
if (rdev && test_bit(Faulty, &rdev->flags))
|
||||
rdev = rcu_dereference(conf->disks[i].replacement);
|
||||
rdev = READ_ONCE(conf->disks[i].replacement);
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags))
|
||||
degraded2++;
|
||||
else if (test_bit(In_sync, &rdev->flags))
|
||||
@ -739,7 +737,6 @@ int raid5_calc_degraded(struct r5conf *conf)
|
||||
if (conf->raid_disks <= conf->previous_raid_disks)
|
||||
degraded2++;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (degraded2 > degraded)
|
||||
return degraded2;
|
||||
return degraded;
|
||||
@ -1184,14 +1181,8 @@ again:
|
||||
bi = &dev->req;
|
||||
rbi = &dev->rreq; /* For writing to replacement */
|
||||
|
||||
rcu_read_lock();
|
||||
rrdev = rcu_dereference(conf->disks[i].replacement);
|
||||
smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
|
||||
rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
if (!rdev) {
|
||||
rdev = rrdev;
|
||||
rrdev = NULL;
|
||||
}
|
||||
rdev = conf->disks[i].rdev;
|
||||
rrdev = conf->disks[i].replacement;
|
||||
if (op_is_write(op)) {
|
||||
if (replace_only)
|
||||
rdev = NULL;
|
||||
@ -1212,7 +1203,6 @@ again:
|
||||
rrdev = NULL;
|
||||
if (rrdev)
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
|
||||
/* We have already checked bad blocks for reads. Now
|
||||
* need to check for writes. We never accept write errors
|
||||
@ -2731,28 +2721,6 @@ static void shrink_stripes(struct r5conf *conf)
|
||||
conf->slab_cache = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* This helper wraps rcu_dereference_protected() and can be used when
|
||||
* it is known that the nr_pending of the rdev is elevated.
|
||||
*/
|
||||
static struct md_rdev *rdev_pend_deref(struct md_rdev __rcu *rdev)
|
||||
{
|
||||
return rcu_dereference_protected(rdev,
|
||||
atomic_read(&rcu_access_pointer(rdev)->nr_pending));
|
||||
}
|
||||
|
||||
/*
|
||||
* This helper wraps rcu_dereference_protected() and should be used
|
||||
* when it is known that the mddev_lock() is held. This is safe
|
||||
* seeing raid5_remove_disk() has the same lock held.
|
||||
*/
|
||||
static struct md_rdev *rdev_mdlock_deref(struct mddev *mddev,
|
||||
struct md_rdev __rcu *rdev)
|
||||
{
|
||||
return rcu_dereference_protected(rdev,
|
||||
lockdep_is_held(&mddev->reconfig_mutex));
|
||||
}
|
||||
|
||||
static void raid5_end_read_request(struct bio * bi)
|
||||
{
|
||||
struct stripe_head *sh = bi->bi_private;
|
||||
@ -2778,9 +2746,9 @@ static void raid5_end_read_request(struct bio * bi)
|
||||
* In that case it moved down to 'rdev'.
|
||||
* rdev is not removed until all requests are finished.
|
||||
*/
|
||||
rdev = rdev_pend_deref(conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].replacement;
|
||||
if (!rdev)
|
||||
rdev = rdev_pend_deref(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
|
||||
if (use_new_offset(conf, sh))
|
||||
s = sh->sector + rdev->new_data_offset;
|
||||
@ -2893,11 +2861,11 @@ static void raid5_end_write_request(struct bio *bi)
|
||||
|
||||
for (i = 0 ; i < disks; i++) {
|
||||
if (bi == &sh->dev[i].req) {
|
||||
rdev = rdev_pend_deref(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
break;
|
||||
}
|
||||
if (bi == &sh->dev[i].rreq) {
|
||||
rdev = rdev_pend_deref(conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].replacement;
|
||||
if (rdev)
|
||||
replacement = 1;
|
||||
else
|
||||
@ -2905,7 +2873,7 @@ static void raid5_end_write_request(struct bio *bi)
|
||||
* replaced it. rdev is not removed
|
||||
* until all requests are finished.
|
||||
*/
|
||||
rdev = rdev_pend_deref(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -3667,15 +3635,13 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
|
||||
int bitmap_end = 0;
|
||||
|
||||
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
|
||||
struct md_rdev *rdev;
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
struct md_rdev *rdev = conf->disks[i].rdev;
|
||||
|
||||
if (rdev && test_bit(In_sync, &rdev->flags) &&
|
||||
!test_bit(Faulty, &rdev->flags))
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
else
|
||||
rdev = NULL;
|
||||
rcu_read_unlock();
|
||||
if (rdev) {
|
||||
if (!rdev_set_badblocks(
|
||||
rdev,
|
||||
@ -3793,16 +3759,17 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
|
||||
/* During recovery devices cannot be removed, so
|
||||
* locking and refcounting of rdevs is not needed
|
||||
*/
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
struct md_rdev *rdev = conf->disks[i].rdev;
|
||||
|
||||
if (rdev
|
||||
&& !test_bit(Faulty, &rdev->flags)
|
||||
&& !test_bit(In_sync, &rdev->flags)
|
||||
&& !rdev_set_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
abort = 1;
|
||||
rdev = rcu_dereference(conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].replacement;
|
||||
|
||||
if (rdev
|
||||
&& !test_bit(Faulty, &rdev->flags)
|
||||
&& !test_bit(In_sync, &rdev->flags)
|
||||
@ -3810,7 +3777,6 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
abort = 1;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
if (abort)
|
||||
conf->recovery_disabled =
|
||||
conf->mddev->recovery_disabled;
|
||||
@ -3823,15 +3789,13 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
|
||||
struct md_rdev *rdev;
|
||||
int rv = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
rdev = rcu_dereference(sh->raid_conf->disks[disk_idx].replacement);
|
||||
rdev = sh->raid_conf->disks[disk_idx].replacement;
|
||||
if (rdev
|
||||
&& !test_bit(Faulty, &rdev->flags)
|
||||
&& !test_bit(In_sync, &rdev->flags)
|
||||
&& (rdev->recovery_offset <= sh->sector
|
||||
|| rdev->mddev->recovery_cp <= sh->sector))
|
||||
rv = 1;
|
||||
rcu_read_unlock();
|
||||
return rv;
|
||||
}
|
||||
|
||||
@ -4708,7 +4672,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
s->log_failed = r5l_log_disk_error(conf);
|
||||
|
||||
/* Now to look around and see what can be done */
|
||||
rcu_read_lock();
|
||||
for (i=disks; i--; ) {
|
||||
struct md_rdev *rdev;
|
||||
sector_t first_bad;
|
||||
@ -4753,7 +4716,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
/* Prefer to use the replacement for reads, but only
|
||||
* if it is recovered enough and has no bad blocks.
|
||||
*/
|
||||
rdev = rcu_dereference(conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].replacement;
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags) &&
|
||||
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
|
||||
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
|
||||
@ -4764,7 +4727,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
set_bit(R5_NeedReplace, &dev->flags);
|
||||
else
|
||||
clear_bit(R5_NeedReplace, &dev->flags);
|
||||
rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
clear_bit(R5_ReadRepl, &dev->flags);
|
||||
}
|
||||
if (rdev && test_bit(Faulty, &rdev->flags))
|
||||
@ -4811,8 +4774,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
if (test_bit(R5_WriteError, &dev->flags)) {
|
||||
/* This flag does not apply to '.replacement'
|
||||
* only to .rdev, so make sure to check that*/
|
||||
struct md_rdev *rdev2 = rcu_dereference(
|
||||
conf->disks[i].rdev);
|
||||
struct md_rdev *rdev2 = conf->disks[i].rdev;
|
||||
|
||||
if (rdev2 == rdev)
|
||||
clear_bit(R5_Insync, &dev->flags);
|
||||
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
|
||||
@ -4824,8 +4787,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
if (test_bit(R5_MadeGood, &dev->flags)) {
|
||||
/* This flag does not apply to '.replacement'
|
||||
* only to .rdev, so make sure to check that*/
|
||||
struct md_rdev *rdev2 = rcu_dereference(
|
||||
conf->disks[i].rdev);
|
||||
struct md_rdev *rdev2 = conf->disks[i].rdev;
|
||||
|
||||
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
|
||||
s->handle_bad_blocks = 1;
|
||||
atomic_inc(&rdev2->nr_pending);
|
||||
@ -4833,8 +4796,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
clear_bit(R5_MadeGood, &dev->flags);
|
||||
}
|
||||
if (test_bit(R5_MadeGoodRepl, &dev->flags)) {
|
||||
struct md_rdev *rdev2 = rcu_dereference(
|
||||
conf->disks[i].replacement);
|
||||
struct md_rdev *rdev2 = conf->disks[i].replacement;
|
||||
|
||||
if (rdev2 && !test_bit(Faulty, &rdev2->flags)) {
|
||||
s->handle_bad_blocks = 1;
|
||||
atomic_inc(&rdev2->nr_pending);
|
||||
@ -4855,8 +4818,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags))
|
||||
do_recovery = 1;
|
||||
else if (!rdev) {
|
||||
rdev = rcu_dereference(
|
||||
conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].replacement;
|
||||
if (rdev && !test_bit(Faulty, &rdev->flags))
|
||||
do_recovery = 1;
|
||||
}
|
||||
@ -4883,7 +4845,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
|
||||
else
|
||||
s->replacing = 1;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -5340,23 +5301,23 @@ finish:
|
||||
struct r5dev *dev = &sh->dev[i];
|
||||
if (test_and_clear_bit(R5_WriteError, &dev->flags)) {
|
||||
/* We own a safe reference to the rdev */
|
||||
rdev = rdev_pend_deref(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
if (!rdev_set_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0))
|
||||
md_error(conf->mddev, rdev);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
|
||||
rdev = rdev_pend_deref(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
rdev_clear_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
|
||||
rdev = rdev_pend_deref(conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].replacement;
|
||||
if (!rdev)
|
||||
/* rdev have been moved down */
|
||||
rdev = rdev_pend_deref(conf->disks[i].rdev);
|
||||
rdev = conf->disks[i].rdev;
|
||||
rdev_clear_badblocks(rdev, sh->sector,
|
||||
RAID5_STRIPE_SECTORS(conf), 0);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
@ -5515,24 +5476,22 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
|
||||
&dd_idx, NULL);
|
||||
end_sector = sector + bio_sectors(raid_bio);
|
||||
|
||||
rcu_read_lock();
|
||||
if (r5c_big_stripe_cached(conf, sector))
|
||||
goto out_rcu_unlock;
|
||||
return 0;
|
||||
|
||||
rdev = rcu_dereference(conf->disks[dd_idx].replacement);
|
||||
rdev = conf->disks[dd_idx].replacement;
|
||||
if (!rdev || test_bit(Faulty, &rdev->flags) ||
|
||||
rdev->recovery_offset < end_sector) {
|
||||
rdev = rcu_dereference(conf->disks[dd_idx].rdev);
|
||||
rdev = conf->disks[dd_idx].rdev;
|
||||
if (!rdev)
|
||||
goto out_rcu_unlock;
|
||||
return 0;
|
||||
if (test_bit(Faulty, &rdev->flags) ||
|
||||
!(test_bit(In_sync, &rdev->flags) ||
|
||||
rdev->recovery_offset >= end_sector))
|
||||
goto out_rcu_unlock;
|
||||
return 0;
|
||||
}
|
||||
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
|
||||
&bad_sectors)) {
|
||||
@ -5576,10 +5535,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
|
||||
raid_bio->bi_iter.bi_sector);
|
||||
submit_bio_noacct(align_bio);
|
||||
return 1;
|
||||
|
||||
out_rcu_unlock:
|
||||
rcu_read_unlock();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
|
||||
@ -6582,14 +6537,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
|
||||
* Note in case of > 1 drive failures it's possible we're rebuilding
|
||||
* one drive while leaving another faulty drive in array.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
struct md_rdev *rdev = conf->disks[i].rdev;
|
||||
|
||||
if (rdev == NULL || test_bit(Faulty, &rdev->flags))
|
||||
still_degraded = 1;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
|
||||
|
||||
@ -6820,18 +6773,7 @@ static void raid5d(struct md_thread *thread)
|
||||
spin_unlock_irq(&conf->device_lock);
|
||||
md_check_recovery(mddev);
|
||||
spin_lock_irq(&conf->device_lock);
|
||||
|
||||
/*
|
||||
* Waiting on MD_SB_CHANGE_PENDING below may deadlock
|
||||
* seeing md_check_recovery() is needed to clear
|
||||
* the flag when using mdmon.
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
wait_event_lock_irq(mddev->sb_wait,
|
||||
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
|
||||
conf->device_lock);
|
||||
}
|
||||
pr_debug("%d stripes handled\n", handled);
|
||||
|
||||
@ -7911,18 +7853,10 @@ static int raid5_run(struct mddev *mddev)
|
||||
|
||||
for (i = 0; i < conf->raid_disks && conf->previous_raid_disks;
|
||||
i++) {
|
||||
rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
|
||||
if (!rdev && conf->disks[i].replacement) {
|
||||
/* The replacement is all we have yet */
|
||||
rdev = rdev_mdlock_deref(mddev,
|
||||
conf->disks[i].replacement);
|
||||
conf->disks[i].replacement = NULL;
|
||||
clear_bit(Replacement, &rdev->flags);
|
||||
rcu_assign_pointer(conf->disks[i].rdev, rdev);
|
||||
}
|
||||
rdev = conf->disks[i].rdev;
|
||||
if (!rdev)
|
||||
continue;
|
||||
if (rcu_access_pointer(conf->disks[i].replacement) &&
|
||||
if (conf->disks[i].replacement &&
|
||||
conf->reshape_progress != MaxSector) {
|
||||
/* replacements and reshape simply do not mix. */
|
||||
pr_warn("md: cannot handle concurrent replacement and reshape.\n");
|
||||
@ -8106,15 +8040,16 @@ static void raid5_status(struct seq_file *seq, struct mddev *mddev)
|
||||
struct r5conf *conf = mddev->private;
|
||||
int i;
|
||||
|
||||
lockdep_assert_held(&mddev->lock);
|
||||
|
||||
seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
|
||||
conf->chunk_sectors / 2, mddev->layout);
|
||||
seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->disks[i].rdev);
|
||||
struct md_rdev *rdev = READ_ONCE(conf->disks[i].rdev);
|
||||
|
||||
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
|
||||
}
|
||||
rcu_read_unlock();
|
||||
seq_printf (seq, "]");
|
||||
}
|
||||
|
||||
@ -8152,9 +8087,8 @@ static int raid5_spare_active(struct mddev *mddev)
|
||||
unsigned long flags;
|
||||
|
||||
for (i = 0; i < conf->raid_disks; i++) {
|
||||
rdev = rdev_mdlock_deref(mddev, conf->disks[i].rdev);
|
||||
replacement = rdev_mdlock_deref(mddev,
|
||||
conf->disks[i].replacement);
|
||||
rdev = conf->disks[i].rdev;
|
||||
replacement = conf->disks[i].replacement;
|
||||
if (replacement
|
||||
&& replacement->recovery_offset == MaxSector
|
||||
&& !test_bit(Faulty, &replacement->flags)
|
||||
@ -8193,7 +8127,7 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
struct r5conf *conf = mddev->private;
|
||||
int err = 0;
|
||||
int number = rdev->raid_disk;
|
||||
struct md_rdev __rcu **rdevp;
|
||||
struct md_rdev **rdevp;
|
||||
struct disk_info *p;
|
||||
struct md_rdev *tmp;
|
||||
|
||||
@ -8216,9 +8150,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
if (unlikely(number >= conf->pool_size))
|
||||
return 0;
|
||||
p = conf->disks + number;
|
||||
if (rdev == rcu_access_pointer(p->rdev))
|
||||
if (rdev == p->rdev)
|
||||
rdevp = &p->rdev;
|
||||
else if (rdev == rcu_access_pointer(p->replacement))
|
||||
else if (rdev == p->replacement)
|
||||
rdevp = &p->replacement;
|
||||
else
|
||||
return 0;
|
||||
@ -8238,37 +8172,24 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
if (!test_bit(Faulty, &rdev->flags) &&
|
||||
mddev->recovery_disabled != conf->recovery_disabled &&
|
||||
!has_failed(conf) &&
|
||||
(!rcu_access_pointer(p->replacement) ||
|
||||
rcu_access_pointer(p->replacement) == rdev) &&
|
||||
(!p->replacement || p->replacement == rdev) &&
|
||||
number < conf->raid_disks) {
|
||||
err = -EBUSY;
|
||||
goto abort;
|
||||
}
|
||||
*rdevp = NULL;
|
||||
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
|
||||
lockdep_assert_held(&mddev->reconfig_mutex);
|
||||
synchronize_rcu();
|
||||
if (atomic_read(&rdev->nr_pending)) {
|
||||
/* lost the race, try later */
|
||||
err = -EBUSY;
|
||||
rcu_assign_pointer(*rdevp, rdev);
|
||||
}
|
||||
}
|
||||
WRITE_ONCE(*rdevp, NULL);
|
||||
if (!err) {
|
||||
err = log_modify(conf, rdev, false);
|
||||
if (err)
|
||||
goto abort;
|
||||
}
|
||||
|
||||
tmp = rcu_access_pointer(p->replacement);
|
||||
tmp = p->replacement;
|
||||
if (tmp) {
|
||||
/* We must have just cleared 'rdev' */
|
||||
rcu_assign_pointer(p->rdev, tmp);
|
||||
WRITE_ONCE(p->rdev, tmp);
|
||||
clear_bit(Replacement, &tmp->flags);
|
||||
smp_mb(); /* Make sure other CPUs may see both as identical
|
||||
* but will never see neither - if they are careful
|
||||
*/
|
||||
rcu_assign_pointer(p->replacement, NULL);
|
||||
WRITE_ONCE(p->replacement, NULL);
|
||||
|
||||
if (!err)
|
||||
err = log_modify(conf, tmp, true);
|
||||
@ -8336,7 +8257,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
rdev->raid_disk = disk;
|
||||
if (rdev->saved_raid_disk != disk)
|
||||
conf->fullsync = 1;
|
||||
rcu_assign_pointer(p->rdev, rdev);
|
||||
WRITE_ONCE(p->rdev, rdev);
|
||||
|
||||
err = log_modify(conf, rdev, true);
|
||||
|
||||
@ -8345,7 +8266,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
}
|
||||
for (disk = first; disk <= last; disk++) {
|
||||
p = conf->disks + disk;
|
||||
tmp = rdev_mdlock_deref(mddev, p->rdev);
|
||||
tmp = p->rdev;
|
||||
if (test_bit(WantReplacement, &tmp->flags) &&
|
||||
mddev->reshape_position == MaxSector &&
|
||||
p->replacement == NULL) {
|
||||
@ -8354,7 +8275,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
|
||||
rdev->raid_disk = disk;
|
||||
err = 0;
|
||||
conf->fullsync = 1;
|
||||
rcu_assign_pointer(p->replacement, rdev);
|
||||
WRITE_ONCE(p->replacement, rdev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -8487,7 +8408,7 @@ static int raid5_start_reshape(struct mddev *mddev)
|
||||
if (mddev->recovery_cp < MaxSector)
|
||||
return -EBUSY;
|
||||
for (i = 0; i < conf->raid_disks; i++)
|
||||
if (rdev_mdlock_deref(mddev, conf->disks[i].replacement))
|
||||
if (conf->disks[i].replacement)
|
||||
return -EBUSY;
|
||||
|
||||
rdev_for_each(rdev, mddev) {
|
||||
@ -8658,12 +8579,10 @@ static void raid5_finish_reshape(struct mddev *mddev)
|
||||
for (d = conf->raid_disks ;
|
||||
d < conf->raid_disks - mddev->delta_disks;
|
||||
d++) {
|
||||
rdev = rdev_mdlock_deref(mddev,
|
||||
conf->disks[d].rdev);
|
||||
rdev = conf->disks[d].rdev;
|
||||
if (rdev)
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
rdev = rdev_mdlock_deref(mddev,
|
||||
conf->disks[d].replacement);
|
||||
rdev = conf->disks[d].replacement;
|
||||
if (rdev)
|
||||
clear_bit(In_sync, &rdev->flags);
|
||||
}
|
||||
|
@ -473,8 +473,8 @@ enum {
|
||||
*/
|
||||
|
||||
struct disk_info {
|
||||
struct md_rdev __rcu *rdev;
|
||||
struct md_rdev __rcu *replacement;
|
||||
struct md_rdev *rdev;
|
||||
struct md_rdev *replacement;
|
||||
struct page *extra_page; /* extra page to use in prexor */
|
||||
};
|
||||
|
||||
|
@ -376,10 +376,8 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
|
||||
blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
|
||||
|
||||
if (tr->discard) {
|
||||
if (tr->discard)
|
||||
blk_queue_max_discard_sectors(new->rq, UINT_MAX);
|
||||
new->rq->limits.discard_granularity = tr->blksize;
|
||||
}
|
||||
|
||||
gd->queue = new->rq;
|
||||
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/nvme_ioctl.h>
|
||||
#include <linux/pm_qos.h>
|
||||
#include <linux/ratelimit.h>
|
||||
#include <asm/unaligned.h>
|
||||
|
||||
#include "nvme.h"
|
||||
@ -312,12 +313,12 @@ static void nvme_log_error(struct request *req)
|
||||
struct nvme_request *nr = nvme_req(req);
|
||||
|
||||
if (ns) {
|
||||
pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %llu blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
|
||||
pr_err_ratelimited("%s: %s(0x%x) @ LBA %llu, %u blocks, %s (sct 0x%x / sc 0x%x) %s%s\n",
|
||||
ns->disk ? ns->disk->disk_name : "?",
|
||||
nvme_get_opcode_str(nr->cmd->common.opcode),
|
||||
nr->cmd->common.opcode,
|
||||
(unsigned long long)nvme_sect_to_lba(ns, blk_rq_pos(req)),
|
||||
(unsigned long long)blk_rq_bytes(req) >> ns->lba_shift,
|
||||
nvme_sect_to_lba(ns->head, blk_rq_pos(req)),
|
||||
blk_rq_bytes(req) >> ns->head->lba_shift,
|
||||
nvme_get_error_status_str(nr->status),
|
||||
nr->status >> 8 & 7, /* Status Code Type */
|
||||
nr->status & 0xff, /* Status Code */
|
||||
@ -372,9 +373,12 @@ static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
|
||||
static inline void nvme_end_req_zoned(struct request *req)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
|
||||
req_op(req) == REQ_OP_ZONE_APPEND)
|
||||
req->__sector = nvme_lba_to_sect(req->q->queuedata,
|
||||
req_op(req) == REQ_OP_ZONE_APPEND) {
|
||||
struct nvme_ns *ns = req->q->queuedata;
|
||||
|
||||
req->__sector = nvme_lba_to_sect(ns->head,
|
||||
le64_to_cpu(nvme_req(req)->result.u64));
|
||||
}
|
||||
}
|
||||
|
||||
static inline void nvme_end_req(struct request *req)
|
||||
@ -793,8 +797,8 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
|
||||
}
|
||||
|
||||
if (queue_max_discard_segments(req->q) == 1) {
|
||||
u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req));
|
||||
u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9);
|
||||
u64 slba = nvme_sect_to_lba(ns->head, blk_rq_pos(req));
|
||||
u32 nlb = blk_rq_sectors(req) >> (ns->head->lba_shift - 9);
|
||||
|
||||
range[0].cattr = cpu_to_le32(0);
|
||||
range[0].nlb = cpu_to_le32(nlb);
|
||||
@ -802,8 +806,9 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
|
||||
n = 1;
|
||||
} else {
|
||||
__rq_for_each_bio(bio, req) {
|
||||
u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector);
|
||||
u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift;
|
||||
u64 slba = nvme_sect_to_lba(ns->head,
|
||||
bio->bi_iter.bi_sector);
|
||||
u32 nlb = bio->bi_iter.bi_size >> ns->head->lba_shift;
|
||||
|
||||
if (n < segments) {
|
||||
range[n].cattr = cpu_to_le32(0);
|
||||
@ -841,7 +846,7 @@ static void nvme_set_ref_tag(struct nvme_ns *ns, struct nvme_command *cmnd,
|
||||
u64 ref48;
|
||||
|
||||
/* both rw and write zeroes share the same reftag format */
|
||||
switch (ns->guard_type) {
|
||||
switch (ns->head->guard_type) {
|
||||
case NVME_NVM_NS_16B_GUARD:
|
||||
cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
|
||||
break;
|
||||
@ -869,17 +874,18 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
|
||||
cmnd->write_zeroes.opcode = nvme_cmd_write_zeroes;
|
||||
cmnd->write_zeroes.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
cmnd->write_zeroes.slba =
|
||||
cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
||||
cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
|
||||
cmnd->write_zeroes.length =
|
||||
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
||||
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
|
||||
|
||||
if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
|
||||
if (!(req->cmd_flags & REQ_NOUNMAP) &&
|
||||
(ns->head->features & NVME_NS_DEAC))
|
||||
cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
|
||||
|
||||
if (nvme_ns_has_pi(ns)) {
|
||||
if (nvme_ns_has_pi(ns->head)) {
|
||||
cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
|
||||
|
||||
switch (ns->pi_type) {
|
||||
switch (ns->head->pi_type) {
|
||||
case NVME_NS_DPS_PI_TYPE1:
|
||||
case NVME_NS_DPS_PI_TYPE2:
|
||||
nvme_set_ref_tag(ns, cmnd, req);
|
||||
@ -911,13 +917,15 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
cmnd->rw.cdw2 = 0;
|
||||
cmnd->rw.cdw3 = 0;
|
||||
cmnd->rw.metadata = 0;
|
||||
cmnd->rw.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
||||
cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
|
||||
cmnd->rw.slba =
|
||||
cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
|
||||
cmnd->rw.length =
|
||||
cpu_to_le16((blk_rq_bytes(req) >> ns->head->lba_shift) - 1);
|
||||
cmnd->rw.reftag = 0;
|
||||
cmnd->rw.apptag = 0;
|
||||
cmnd->rw.appmask = 0;
|
||||
|
||||
if (ns->ms) {
|
||||
if (ns->head->ms) {
|
||||
/*
|
||||
* If formated with metadata, the block layer always provides a
|
||||
* metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else
|
||||
@ -925,12 +933,12 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
|
||||
* namespace capacity to zero to prevent any I/O.
|
||||
*/
|
||||
if (!blk_integrity_rq(req)) {
|
||||
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns)))
|
||||
if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
|
||||
return BLK_STS_NOTSUPP;
|
||||
control |= NVME_RW_PRINFO_PRACT;
|
||||
}
|
||||
|
||||
switch (ns->pi_type) {
|
||||
switch (ns->head->pi_type) {
|
||||
case NVME_NS_DPS_PI_TYPE3:
|
||||
control |= NVME_RW_PRINFO_PRCHK_GUARD;
|
||||
break;
|
||||
@ -1452,7 +1460,7 @@ free_data:
|
||||
return status;
|
||||
}
|
||||
|
||||
static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
|
||||
int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
|
||||
struct nvme_id_ns **id)
|
||||
{
|
||||
struct nvme_command c = { };
|
||||
@ -1671,14 +1679,14 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
||||
static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
|
||||
u32 max_integrity_segments)
|
||||
static void nvme_init_integrity(struct gendisk *disk,
|
||||
struct nvme_ns_head *head, u32 max_integrity_segments)
|
||||
{
|
||||
struct blk_integrity integrity = { };
|
||||
|
||||
switch (ns->pi_type) {
|
||||
switch (head->pi_type) {
|
||||
case NVME_NS_DPS_PI_TYPE3:
|
||||
switch (ns->guard_type) {
|
||||
switch (head->guard_type) {
|
||||
case NVME_NVM_NS_16B_GUARD:
|
||||
integrity.profile = &t10_pi_type3_crc;
|
||||
integrity.tag_size = sizeof(u16) + sizeof(u32);
|
||||
@ -1696,7 +1704,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
|
||||
break;
|
||||
case NVME_NS_DPS_PI_TYPE1:
|
||||
case NVME_NS_DPS_PI_TYPE2:
|
||||
switch (ns->guard_type) {
|
||||
switch (head->guard_type) {
|
||||
case NVME_NVM_NS_16B_GUARD:
|
||||
integrity.profile = &t10_pi_type1_crc;
|
||||
integrity.tag_size = sizeof(u16);
|
||||
@ -1717,25 +1725,26 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
|
||||
break;
|
||||
}
|
||||
|
||||
integrity.tuple_size = ns->ms;
|
||||
integrity.tuple_size = head->ms;
|
||||
blk_integrity_register(disk, &integrity);
|
||||
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
|
||||
}
|
||||
#else
|
||||
static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
|
||||
u32 max_integrity_segments)
|
||||
static void nvme_init_integrity(struct gendisk *disk,
|
||||
struct nvme_ns_head *head, u32 max_integrity_segments)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
||||
|
||||
static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
|
||||
static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
|
||||
struct nvme_ns_head *head)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
struct request_queue *queue = disk->queue;
|
||||
u32 size = queue_logical_block_size(queue);
|
||||
|
||||
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
|
||||
ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
|
||||
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX))
|
||||
ctrl->max_discard_sectors =
|
||||
nvme_lba_to_sect(head, ctrl->dmrsl);
|
||||
|
||||
if (ctrl->max_discard_sectors == 0) {
|
||||
blk_queue_max_discard_sectors(queue, 0);
|
||||
@ -1766,21 +1775,21 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
|
||||
a->csi == b->csi;
|
||||
}
|
||||
|
||||
static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
|
||||
struct nvme_id_ns *id)
|
||||
{
|
||||
bool first = id->dps & NVME_NS_DPS_PI_FIRST;
|
||||
unsigned lbaf = nvme_lbaf_index(id->flbas);
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
struct nvme_command c = { };
|
||||
struct nvme_id_ns_nvm *nvm;
|
||||
int ret = 0;
|
||||
u32 elbaf;
|
||||
|
||||
ns->pi_size = 0;
|
||||
ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
|
||||
head->pi_size = 0;
|
||||
head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
|
||||
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
|
||||
ns->pi_size = sizeof(struct t10_pi_tuple);
|
||||
ns->guard_type = NVME_NVM_NS_16B_GUARD;
|
||||
head->pi_size = sizeof(struct t10_pi_tuple);
|
||||
head->guard_type = NVME_NVM_NS_16B_GUARD;
|
||||
goto set_pi;
|
||||
}
|
||||
|
||||
@ -1789,11 +1798,11 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
return -ENOMEM;
|
||||
|
||||
c.identify.opcode = nvme_admin_identify;
|
||||
c.identify.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
c.identify.nsid = cpu_to_le32(head->ns_id);
|
||||
c.identify.cns = NVME_ID_CNS_CS_NS;
|
||||
c.identify.csi = NVME_CSI_NVM;
|
||||
|
||||
ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, nvm, sizeof(*nvm));
|
||||
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
|
||||
if (ret)
|
||||
goto free_data;
|
||||
|
||||
@ -1803,13 +1812,13 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
if (nvme_elbaf_sts(elbaf))
|
||||
goto free_data;
|
||||
|
||||
ns->guard_type = nvme_elbaf_guard_type(elbaf);
|
||||
switch (ns->guard_type) {
|
||||
head->guard_type = nvme_elbaf_guard_type(elbaf);
|
||||
switch (head->guard_type) {
|
||||
case NVME_NVM_NS_64B_GUARD:
|
||||
ns->pi_size = sizeof(struct crc64_pi_tuple);
|
||||
head->pi_size = sizeof(struct crc64_pi_tuple);
|
||||
break;
|
||||
case NVME_NVM_NS_16B_GUARD:
|
||||
ns->pi_size = sizeof(struct t10_pi_tuple);
|
||||
head->pi_size = sizeof(struct t10_pi_tuple);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@ -1818,25 +1827,25 @@ static int nvme_init_ms(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
free_data:
|
||||
kfree(nvm);
|
||||
set_pi:
|
||||
if (ns->pi_size && (first || ns->ms == ns->pi_size))
|
||||
ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
|
||||
if (head->pi_size && (first || head->ms == head->pi_size))
|
||||
head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
|
||||
else
|
||||
ns->pi_type = 0;
|
||||
head->pi_type = 0;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
|
||||
struct nvme_ns_head *head, struct nvme_id_ns *id)
|
||||
{
|
||||
struct nvme_ctrl *ctrl = ns->ctrl;
|
||||
int ret;
|
||||
|
||||
ret = nvme_init_ms(ns, id);
|
||||
ret = nvme_init_ms(ctrl, head, id);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ns->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
|
||||
if (!ns->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
|
||||
head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
|
||||
if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
|
||||
return 0;
|
||||
|
||||
if (ctrl->ops->flags & NVME_F_FABRICS) {
|
||||
@ -1848,7 +1857,7 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
|
||||
return 0;
|
||||
|
||||
ns->features |= NVME_NS_EXT_LBAS;
|
||||
head->features |= NVME_NS_EXT_LBAS;
|
||||
|
||||
/*
|
||||
* The current fabrics transport drivers support namespace
|
||||
@ -1859,8 +1868,8 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
* Note, this check will need to be modified if any drivers
|
||||
* gain the ability to use other metadata formats.
|
||||
*/
|
||||
if (ctrl->max_integrity_segments && nvme_ns_has_pi(ns))
|
||||
ns->features |= NVME_NS_METADATA_SUPPORTED;
|
||||
if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
|
||||
head->features |= NVME_NS_METADATA_SUPPORTED;
|
||||
} else {
|
||||
/*
|
||||
* For PCIe controllers, we can't easily remap the separate
|
||||
@ -1869,9 +1878,9 @@ static int nvme_configure_metadata(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
* We allow extended LBAs for the passthrough interface, though.
|
||||
*/
|
||||
if (id->flbas & NVME_NS_FLBAS_META_EXT)
|
||||
ns->features |= NVME_NS_EXT_LBAS;
|
||||
head->features |= NVME_NS_EXT_LBAS;
|
||||
else
|
||||
ns->features |= NVME_NS_METADATA_SUPPORTED;
|
||||
head->features |= NVME_NS_METADATA_SUPPORTED;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@ -1894,11 +1903,11 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
|
||||
blk_queue_write_cache(q, vwc, vwc);
|
||||
}
|
||||
|
||||
static void nvme_update_disk_info(struct gendisk *disk,
|
||||
struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
|
||||
struct nvme_ns_head *head, struct nvme_id_ns *id)
|
||||
{
|
||||
sector_t capacity = nvme_lba_to_sect(ns, le64_to_cpu(id->nsze));
|
||||
u32 bs = 1U << ns->lba_shift;
|
||||
sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
|
||||
u32 bs = 1U << head->lba_shift;
|
||||
u32 atomic_bs, phys_bs, io_opt = 0;
|
||||
|
||||
/*
|
||||
@ -1906,7 +1915,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
|
||||
* or smaller than a sector size yet, so catch this early and don't
|
||||
* allow block I/O.
|
||||
*/
|
||||
if (ns->lba_shift > PAGE_SHIFT || ns->lba_shift < SECTOR_SHIFT) {
|
||||
if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
|
||||
capacity = 0;
|
||||
bs = (1 << 9);
|
||||
}
|
||||
@ -1923,7 +1932,7 @@ static void nvme_update_disk_info(struct gendisk *disk,
|
||||
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
|
||||
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
|
||||
else
|
||||
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
|
||||
atomic_bs = (1 + ctrl->subsys->awupf) * bs;
|
||||
}
|
||||
|
||||
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
|
||||
@ -1949,20 +1958,20 @@ static void nvme_update_disk_info(struct gendisk *disk,
|
||||
* I/O to namespaces with metadata except when the namespace supports
|
||||
* PI, as it can strip/insert in that case.
|
||||
*/
|
||||
if (ns->ms) {
|
||||
if (head->ms) {
|
||||
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
|
||||
(ns->features & NVME_NS_METADATA_SUPPORTED))
|
||||
nvme_init_integrity(disk, ns,
|
||||
ns->ctrl->max_integrity_segments);
|
||||
else if (!nvme_ns_has_pi(ns))
|
||||
(head->features & NVME_NS_METADATA_SUPPORTED))
|
||||
nvme_init_integrity(disk, head,
|
||||
ctrl->max_integrity_segments);
|
||||
else if (!nvme_ns_has_pi(head))
|
||||
capacity = 0;
|
||||
}
|
||||
|
||||
set_capacity_and_notify(disk, capacity);
|
||||
|
||||
nvme_config_discard(disk, ns);
|
||||
nvme_config_discard(ctrl, disk, head);
|
||||
blk_queue_max_write_zeroes_sectors(disk->queue,
|
||||
ns->ctrl->max_zeroes_sectors);
|
||||
ctrl->max_zeroes_sectors);
|
||||
}
|
||||
|
||||
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
|
||||
@ -1985,7 +1994,7 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
|
||||
is_power_of_2(ctrl->max_hw_sectors))
|
||||
iob = ctrl->max_hw_sectors;
|
||||
else
|
||||
iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob));
|
||||
iob = nvme_lba_to_sect(ns->head, le16_to_cpu(id->noiob));
|
||||
|
||||
if (!iob)
|
||||
return;
|
||||
@ -2052,16 +2061,17 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
|
||||
|
||||
blk_mq_freeze_queue(ns->disk->queue);
|
||||
lbaf = nvme_lbaf_index(id->flbas);
|
||||
ns->lba_shift = id->lbaf[lbaf].ds;
|
||||
ns->head->lba_shift = id->lbaf[lbaf].ds;
|
||||
ns->head->nuse = le64_to_cpu(id->nuse);
|
||||
nvme_set_queue_limits(ns->ctrl, ns->queue);
|
||||
|
||||
ret = nvme_configure_metadata(ns, id);
|
||||
ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
|
||||
if (ret < 0) {
|
||||
blk_mq_unfreeze_queue(ns->disk->queue);
|
||||
goto out;
|
||||
}
|
||||
nvme_set_chunk_sectors(ns, id);
|
||||
nvme_update_disk_info(ns->disk, ns, id);
|
||||
nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
|
||||
|
||||
if (ns->head->ids.csi == NVME_CSI_ZNS) {
|
||||
ret = nvme_update_zone_info(ns, lbaf);
|
||||
@ -2078,7 +2088,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
|
||||
* do not return zeroes.
|
||||
*/
|
||||
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
|
||||
ns->features |= NVME_NS_DEAC;
|
||||
ns->head->features |= NVME_NS_DEAC;
|
||||
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
|
||||
set_bit(NVME_NS_READY, &ns->flags);
|
||||
blk_mq_unfreeze_queue(ns->disk->queue);
|
||||
@ -2091,7 +2101,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
|
||||
|
||||
if (nvme_ns_head_multipath(ns->head)) {
|
||||
blk_mq_freeze_queue(ns->head->disk->queue);
|
||||
nvme_update_disk_info(ns->head->disk, ns, id);
|
||||
nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
|
||||
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
|
||||
nvme_mpath_revalidate_paths(ns);
|
||||
blk_stack_limits(&ns->head->disk->queue->limits,
|
||||
@ -3026,6 +3036,42 @@ static int nvme_init_effects(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
|
||||
{
|
||||
/*
|
||||
* In fabrics we need to verify the cntlid matches the
|
||||
* admin connect
|
||||
*/
|
||||
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
|
||||
dev_err(ctrl->device,
|
||||
"Mismatching cntlid: Connect %u vs Identify %u, rejecting\n",
|
||||
ctrl->cntlid, le16_to_cpu(id->cntlid));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
|
||||
dev_err(ctrl->device,
|
||||
"keep-alive support is mandatory for fabrics\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!nvme_discovery_ctrl(ctrl) && ctrl->ioccsz < 4) {
|
||||
dev_err(ctrl->device,
|
||||
"I/O queue command capsule supported size %d < 4\n",
|
||||
ctrl->ioccsz);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!nvme_discovery_ctrl(ctrl) && ctrl->iorcsz < 1) {
|
||||
dev_err(ctrl->device,
|
||||
"I/O queue response capsule supported size %d < 1\n",
|
||||
ctrl->iorcsz);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int nvme_init_identify(struct nvme_ctrl *ctrl)
|
||||
{
|
||||
struct nvme_id_ctrl *id;
|
||||
@ -3138,25 +3184,9 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
|
||||
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
|
||||
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
|
||||
|
||||
/*
|
||||
* In fabrics we need to verify the cntlid matches the
|
||||
* admin connect
|
||||
*/
|
||||
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
|
||||
dev_err(ctrl->device,
|
||||
"Mismatching cntlid: Connect %u vs Identify "
|
||||
"%u, rejecting\n",
|
||||
ctrl->cntlid, le16_to_cpu(id->cntlid));
|
||||
ret = -EINVAL;
|
||||
ret = nvme_check_ctrl_fabric_info(ctrl, id);
|
||||
if (ret)
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) {
|
||||
dev_err(ctrl->device,
|
||||
"keep-alive support is mandatory for fabrics\n");
|
||||
ret = -EINVAL;
|
||||
goto out_free;
|
||||
}
|
||||
} else {
|
||||
ctrl->hmpre = le32_to_cpu(id->hmpre);
|
||||
ctrl->hmmin = le32_to_cpu(id->hmmin);
|
||||
@ -3415,6 +3445,8 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
|
||||
head->ns_id = info->nsid;
|
||||
head->ids = info->ids;
|
||||
head->shared = info->is_shared;
|
||||
ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
|
||||
ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
|
||||
kref_init(&head->ref);
|
||||
|
||||
if (head->ids.csi) {
|
||||
@ -3674,7 +3706,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
|
||||
up_write(&ctrl->namespaces_rwsem);
|
||||
nvme_get_ctrl(ctrl);
|
||||
|
||||
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
|
||||
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
|
||||
goto out_cleanup_ns_from_list;
|
||||
|
||||
if (!nvme_ns_head_multipath(ns->head))
|
||||
|
@ -3498,10 +3498,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
|
||||
|
||||
ctrl->ctrl.opts = opts;
|
||||
ctrl->ctrl.nr_reconnects = 0;
|
||||
if (lport->dev)
|
||||
ctrl->ctrl.numa_node = dev_to_node(lport->dev);
|
||||
else
|
||||
ctrl->ctrl.numa_node = NUMA_NO_NODE;
|
||||
INIT_LIST_HEAD(&ctrl->ctrl_list);
|
||||
ctrl->lport = lport;
|
||||
ctrl->rport = rport;
|
||||
@ -3546,6 +3542,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
|
||||
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
|
||||
if (ret)
|
||||
goto out_free_queues;
|
||||
if (lport->dev)
|
||||
ctrl->ctrl.numa_node = dev_to_node(lport->dev);
|
||||
|
||||
/* at this point, teardown path changes to ref counting on nvme ctrl */
|
||||
|
||||
|
@ -97,58 +97,6 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
|
||||
return (void __user *)ptrval;
|
||||
}
|
||||
|
||||
static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
|
||||
unsigned len, u32 seed)
|
||||
{
|
||||
struct bio_integrity_payload *bip;
|
||||
int ret = -ENOMEM;
|
||||
void *buf;
|
||||
struct bio *bio = req->bio;
|
||||
|
||||
buf = kmalloc(len, GFP_KERNEL);
|
||||
if (!buf)
|
||||
goto out;
|
||||
|
||||
if (req_op(req) == REQ_OP_DRV_OUT) {
|
||||
ret = -EFAULT;
|
||||
if (copy_from_user(buf, ubuf, len))
|
||||
goto out_free_meta;
|
||||
} else {
|
||||
memset(buf, 0, len);
|
||||
}
|
||||
|
||||
bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
|
||||
if (IS_ERR(bip)) {
|
||||
ret = PTR_ERR(bip);
|
||||
goto out_free_meta;
|
||||
}
|
||||
|
||||
bip->bip_iter.bi_sector = seed;
|
||||
ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
|
||||
offset_in_page(buf));
|
||||
if (ret != len) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free_meta;
|
||||
}
|
||||
|
||||
req->cmd_flags |= REQ_INTEGRITY;
|
||||
return buf;
|
||||
out_free_meta:
|
||||
kfree(buf);
|
||||
out:
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
|
||||
static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
|
||||
void *meta, unsigned len, int ret)
|
||||
{
|
||||
if (!ret && req_op(req) == REQ_OP_DRV_IN &&
|
||||
copy_to_user(ubuf, meta, len))
|
||||
ret = -EFAULT;
|
||||
kfree(meta);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct request *nvme_alloc_user_request(struct request_queue *q,
|
||||
struct nvme_command *cmd, blk_opf_t rq_flags,
|
||||
blk_mq_req_flags_t blk_flags)
|
||||
@ -165,14 +113,12 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
|
||||
|
||||
static int nvme_map_user_request(struct request *req, u64 ubuffer,
|
||||
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
|
||||
u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
|
||||
unsigned int flags)
|
||||
u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
|
||||
{
|
||||
struct request_queue *q = req->q;
|
||||
struct nvme_ns *ns = q->queuedata;
|
||||
struct block_device *bdev = ns ? ns->disk->part0 : NULL;
|
||||
struct bio *bio = NULL;
|
||||
void *meta = NULL;
|
||||
int ret;
|
||||
|
||||
if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
|
||||
@ -194,18 +140,17 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
|
||||
|
||||
if (ret)
|
||||
goto out;
|
||||
bio = req->bio;
|
||||
if (bdev)
|
||||
bio_set_dev(bio, bdev);
|
||||
|
||||
if (bdev && meta_buffer && meta_len) {
|
||||
meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
|
||||
meta_seed);
|
||||
if (IS_ERR(meta)) {
|
||||
ret = PTR_ERR(meta);
|
||||
goto out_unmap;
|
||||
bio = req->bio;
|
||||
if (bdev) {
|
||||
bio_set_dev(bio, bdev);
|
||||
if (meta_buffer && meta_len) {
|
||||
ret = bio_integrity_map_user(bio, meta_buffer, meta_len,
|
||||
meta_seed);
|
||||
if (ret)
|
||||
goto out_unmap;
|
||||
req->cmd_flags |= REQ_INTEGRITY;
|
||||
}
|
||||
*metap = meta;
|
||||
}
|
||||
|
||||
return ret;
|
||||
@ -226,7 +171,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
struct nvme_ns *ns = q->queuedata;
|
||||
struct nvme_ctrl *ctrl;
|
||||
struct request *req;
|
||||
void *meta = NULL;
|
||||
struct bio *bio;
|
||||
u32 effects;
|
||||
int ret;
|
||||
@ -238,7 +182,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
req->timeout = timeout;
|
||||
if (ubuffer && bufflen) {
|
||||
ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
|
||||
meta_len, meta_seed, &meta, NULL, flags);
|
||||
meta_len, meta_seed, NULL, flags);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
@ -250,9 +194,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
|
||||
ret = nvme_execute_rq(req, false);
|
||||
if (result)
|
||||
*result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
if (meta)
|
||||
ret = nvme_finish_user_metadata(req, meta_buffer, meta,
|
||||
meta_len, ret);
|
||||
if (bio)
|
||||
blk_rq_unmap_user(bio);
|
||||
blk_mq_free_request(req);
|
||||
@ -284,10 +225,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
length = (io.nblocks + 1) << ns->lba_shift;
|
||||
length = (io.nblocks + 1) << ns->head->lba_shift;
|
||||
|
||||
if ((io.control & NVME_RW_PRINFO_PRACT) &&
|
||||
ns->ms == sizeof(struct t10_pi_tuple)) {
|
||||
ns->head->ms == sizeof(struct t10_pi_tuple)) {
|
||||
/*
|
||||
* Protection information is stripped/inserted by the
|
||||
* controller.
|
||||
@ -297,11 +238,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
|
||||
meta_len = 0;
|
||||
metadata = NULL;
|
||||
} else {
|
||||
meta_len = (io.nblocks + 1) * ns->ms;
|
||||
meta_len = (io.nblocks + 1) * ns->head->ms;
|
||||
metadata = nvme_to_user_ptr(io.metadata);
|
||||
}
|
||||
|
||||
if (ns->features & NVME_NS_EXT_LBAS) {
|
||||
if (ns->head->features & NVME_NS_EXT_LBAS) {
|
||||
length += meta_len;
|
||||
meta_len = 0;
|
||||
} else if (meta_len) {
|
||||
@ -447,19 +388,10 @@ struct nvme_uring_data {
|
||||
* Expect build errors if this grows larger than that.
|
||||
*/
|
||||
struct nvme_uring_cmd_pdu {
|
||||
union {
|
||||
struct bio *bio;
|
||||
struct request *req;
|
||||
};
|
||||
u32 meta_len;
|
||||
u32 nvme_status;
|
||||
union {
|
||||
struct {
|
||||
void *meta; /* kernel-resident buffer */
|
||||
void __user *meta_buffer;
|
||||
};
|
||||
u64 result;
|
||||
} u;
|
||||
struct request *req;
|
||||
struct bio *bio;
|
||||
u64 result;
|
||||
int status;
|
||||
};
|
||||
|
||||
static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
|
||||
@ -468,31 +400,6 @@ static inline struct nvme_uring_cmd_pdu *nvme_uring_cmd_pdu(
|
||||
return (struct nvme_uring_cmd_pdu *)&ioucmd->pdu;
|
||||
}
|
||||
|
||||
static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
struct request *req = pdu->req;
|
||||
int status;
|
||||
u64 result;
|
||||
|
||||
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
|
||||
status = -EINTR;
|
||||
else
|
||||
status = nvme_req(req)->status;
|
||||
|
||||
result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
|
||||
if (pdu->meta_len)
|
||||
status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
|
||||
pdu->u.meta, pdu->meta_len, status);
|
||||
if (req->bio)
|
||||
blk_rq_unmap_user(req->bio);
|
||||
blk_mq_free_request(req);
|
||||
|
||||
io_uring_cmd_done(ioucmd, status, result, issue_flags);
|
||||
}
|
||||
|
||||
static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
|
||||
unsigned issue_flags)
|
||||
{
|
||||
@ -500,8 +407,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
|
||||
|
||||
if (pdu->bio)
|
||||
blk_rq_unmap_user(pdu->bio);
|
||||
|
||||
io_uring_cmd_done(ioucmd, pdu->nvme_status, pdu->u.result, issue_flags);
|
||||
io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
|
||||
}
|
||||
|
||||
static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
|
||||
@ -510,53 +416,24 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
|
||||
struct io_uring_cmd *ioucmd = req->end_io_data;
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
|
||||
req->bio = pdu->bio;
|
||||
if (nvme_req(req)->flags & NVME_REQ_CANCELLED) {
|
||||
pdu->nvme_status = -EINTR;
|
||||
} else {
|
||||
pdu->nvme_status = nvme_req(req)->status;
|
||||
if (!pdu->nvme_status)
|
||||
pdu->nvme_status = blk_status_to_errno(err);
|
||||
}
|
||||
pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
|
||||
pdu->status = -EINTR;
|
||||
else
|
||||
pdu->status = nvme_req(req)->status;
|
||||
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
|
||||
|
||||
/*
|
||||
* For iopoll, complete it directly.
|
||||
* Otherwise, move the completion to task work.
|
||||
*/
|
||||
if (blk_rq_is_poll(req)) {
|
||||
WRITE_ONCE(ioucmd->cookie, NULL);
|
||||
if (blk_rq_is_poll(req))
|
||||
nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED);
|
||||
} else {
|
||||
else
|
||||
io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
|
||||
}
|
||||
|
||||
return RQ_END_IO_FREE;
|
||||
}
|
||||
|
||||
static enum rq_end_io_ret nvme_uring_cmd_end_io_meta(struct request *req,
|
||||
blk_status_t err)
|
||||
{
|
||||
struct io_uring_cmd *ioucmd = req->end_io_data;
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
|
||||
req->bio = pdu->bio;
|
||||
pdu->req = req;
|
||||
|
||||
/*
|
||||
* For iopoll, complete it directly.
|
||||
* Otherwise, move the completion to task work.
|
||||
*/
|
||||
if (blk_rq_is_poll(req)) {
|
||||
WRITE_ONCE(ioucmd->cookie, NULL);
|
||||
nvme_uring_task_meta_cb(ioucmd, IO_URING_F_UNLOCKED);
|
||||
} else {
|
||||
io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_meta_cb);
|
||||
}
|
||||
|
||||
return RQ_END_IO_NONE;
|
||||
}
|
||||
|
||||
static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
|
||||
{
|
||||
@ -568,7 +445,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
struct request *req;
|
||||
blk_opf_t rq_flags = REQ_ALLOC_CACHE;
|
||||
blk_mq_req_flags_t blk_flags = 0;
|
||||
void *meta = NULL;
|
||||
int ret;
|
||||
|
||||
c.common.opcode = READ_ONCE(cmd->opcode);
|
||||
@ -616,27 +492,16 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
|
||||
if (d.addr && d.data_len) {
|
||||
ret = nvme_map_user_request(req, d.addr,
|
||||
d.data_len, nvme_to_user_ptr(d.metadata),
|
||||
d.metadata_len, 0, &meta, ioucmd, vec);
|
||||
d.metadata_len, 0, ioucmd, vec);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (blk_rq_is_poll(req)) {
|
||||
ioucmd->flags |= IORING_URING_CMD_POLLED;
|
||||
WRITE_ONCE(ioucmd->cookie, req);
|
||||
}
|
||||
|
||||
/* to free bio on completion, as req->bio will be null at that time */
|
||||
pdu->bio = req->bio;
|
||||
pdu->meta_len = d.metadata_len;
|
||||
pdu->req = req;
|
||||
req->end_io_data = ioucmd;
|
||||
if (pdu->meta_len) {
|
||||
pdu->u.meta = meta;
|
||||
pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
|
||||
req->end_io = nvme_uring_cmd_end_io_meta;
|
||||
} else {
|
||||
req->end_io = nvme_uring_cmd_end_io;
|
||||
}
|
||||
req->end_io = nvme_uring_cmd_end_io;
|
||||
blk_execute_rq_nowait(req, false);
|
||||
return -EIOCBQUEUED;
|
||||
}
|
||||
@ -787,16 +652,12 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
|
||||
struct io_comp_batch *iob,
|
||||
unsigned int poll_flags)
|
||||
{
|
||||
struct request *req;
|
||||
int ret = 0;
|
||||
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
|
||||
struct request *req = pdu->req;
|
||||
|
||||
if (!(ioucmd->flags & IORING_URING_CMD_POLLED))
|
||||
return 0;
|
||||
|
||||
req = READ_ONCE(ioucmd->cookie);
|
||||
if (req && blk_rq_is_poll(req))
|
||||
ret = blk_rq_poll(req, iob, poll_flags);
|
||||
return ret;
|
||||
return blk_rq_poll(req, iob, poll_flags);
|
||||
return 0;
|
||||
}
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
|
||||
|
@ -579,7 +579,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
|
||||
*/
|
||||
if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
|
||||
rc = device_add_disk(&head->subsys->dev, head->disk,
|
||||
nvme_ns_id_attr_groups);
|
||||
nvme_ns_attr_groups);
|
||||
if (rc) {
|
||||
clear_bit(NVME_NSHEAD_DISK_LIVE, &ns->flags);
|
||||
return;
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/wait.h>
|
||||
#include <linux/t10-pi.h>
|
||||
#include <linux/ratelimit_types.h>
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
@ -450,13 +451,27 @@ struct nvme_ns_head {
|
||||
struct list_head list;
|
||||
struct srcu_struct srcu;
|
||||
struct nvme_subsystem *subsys;
|
||||
unsigned ns_id;
|
||||
struct nvme_ns_ids ids;
|
||||
struct list_head entry;
|
||||
struct kref ref;
|
||||
bool shared;
|
||||
int instance;
|
||||
struct nvme_effects_log *effects;
|
||||
u64 nuse;
|
||||
unsigned ns_id;
|
||||
int lba_shift;
|
||||
u16 ms;
|
||||
u16 pi_size;
|
||||
u8 pi_type;
|
||||
u8 guard_type;
|
||||
u16 sgs;
|
||||
u32 sws;
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
u64 zsze;
|
||||
#endif
|
||||
unsigned long features;
|
||||
|
||||
struct ratelimit_state rs_nuse;
|
||||
|
||||
struct cdev cdev;
|
||||
struct device cdev_device;
|
||||
@ -498,17 +513,6 @@ struct nvme_ns {
|
||||
struct kref kref;
|
||||
struct nvme_ns_head *head;
|
||||
|
||||
int lba_shift;
|
||||
u16 ms;
|
||||
u16 pi_size;
|
||||
u16 sgs;
|
||||
u32 sws;
|
||||
u8 pi_type;
|
||||
u8 guard_type;
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
u64 zsze;
|
||||
#endif
|
||||
unsigned long features;
|
||||
unsigned long flags;
|
||||
#define NVME_NS_REMOVING 0
|
||||
#define NVME_NS_ANA_PENDING 2
|
||||
@ -523,9 +527,9 @@ struct nvme_ns {
|
||||
};
|
||||
|
||||
/* NVMe ns supports metadata actions by the controller (generate/strip) */
|
||||
static inline bool nvme_ns_has_pi(struct nvme_ns *ns)
|
||||
static inline bool nvme_ns_has_pi(struct nvme_ns_head *head)
|
||||
{
|
||||
return ns->pi_type && ns->ms == ns->pi_size;
|
||||
return head->pi_type && head->ms == head->pi_size;
|
||||
}
|
||||
|
||||
struct nvme_ctrl_ops {
|
||||
@ -657,17 +661,17 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
|
||||
/*
|
||||
* Convert a 512B sector number to a device logical block number.
|
||||
*/
|
||||
static inline u64 nvme_sect_to_lba(struct nvme_ns *ns, sector_t sector)
|
||||
static inline u64 nvme_sect_to_lba(struct nvme_ns_head *head, sector_t sector)
|
||||
{
|
||||
return sector >> (ns->lba_shift - SECTOR_SHIFT);
|
||||
return sector >> (head->lba_shift - SECTOR_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a device logical block number to a 512B sector number.
|
||||
*/
|
||||
static inline sector_t nvme_lba_to_sect(struct nvme_ns *ns, u64 lba)
|
||||
static inline sector_t nvme_lba_to_sect(struct nvme_ns_head *head, u64 lba)
|
||||
{
|
||||
return lba << (ns->lba_shift - SECTOR_SHIFT);
|
||||
return lba << (head->lba_shift - SECTOR_SHIFT);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -873,10 +877,12 @@ int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
|
||||
unsigned int issue_flags);
|
||||
int nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
|
||||
unsigned int issue_flags);
|
||||
int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
|
||||
struct nvme_id_ns **id);
|
||||
int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
|
||||
int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
|
||||
|
||||
extern const struct attribute_group *nvme_ns_id_attr_groups[];
|
||||
extern const struct attribute_group *nvme_ns_attr_groups[];
|
||||
extern const struct pr_ops nvme_pr_ops;
|
||||
extern const struct block_device_operations nvme_ns_head_ops;
|
||||
extern const struct attribute_group nvme_dev_attrs_group;
|
||||
|
@ -1423,7 +1423,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
|
||||
goto mr_put;
|
||||
|
||||
nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
|
||||
req->mr->sig_attrs, ns->pi_type);
|
||||
req->mr->sig_attrs, ns->head->pi_type);
|
||||
nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
|
||||
|
||||
ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
|
||||
@ -2017,7 +2017,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
|
||||
queue->pi_support &&
|
||||
(c->common.opcode == nvme_cmd_write ||
|
||||
c->common.opcode == nvme_cmd_read) &&
|
||||
nvme_ns_has_pi(ns))
|
||||
nvme_ns_has_pi(ns->head))
|
||||
req->use_sig_mr = true;
|
||||
else
|
||||
req->use_sig_mr = false;
|
||||
|
@ -114,12 +114,97 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
|
||||
}
|
||||
static DEVICE_ATTR_RO(nsid);
|
||||
|
||||
static struct attribute *nvme_ns_id_attrs[] = {
|
||||
static ssize_t csi_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%u\n", dev_to_ns_head(dev)->ids.csi);
|
||||
}
|
||||
static DEVICE_ATTR_RO(csi);
|
||||
|
||||
static ssize_t metadata_bytes_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%u\n", dev_to_ns_head(dev)->ms);
|
||||
}
|
||||
static DEVICE_ATTR_RO(metadata_bytes);
|
||||
|
||||
static int ns_head_update_nuse(struct nvme_ns_head *head)
|
||||
{
|
||||
struct nvme_id_ns *id;
|
||||
struct nvme_ns *ns;
|
||||
int srcu_idx, ret = -EWOULDBLOCK;
|
||||
|
||||
/* Avoid issuing commands too often by rate limiting the update */
|
||||
if (!__ratelimit(&head->rs_nuse))
|
||||
return 0;
|
||||
|
||||
srcu_idx = srcu_read_lock(&head->srcu);
|
||||
ns = nvme_find_path(head);
|
||||
if (!ns)
|
||||
goto out_unlock;
|
||||
|
||||
ret = nvme_identify_ns(ns->ctrl, head->ns_id, &id);
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
head->nuse = le64_to_cpu(id->nuse);
|
||||
kfree(id);
|
||||
|
||||
out_unlock:
|
||||
srcu_read_unlock(&head->srcu, srcu_idx);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ns_update_nuse(struct nvme_ns *ns)
|
||||
{
|
||||
struct nvme_id_ns *id;
|
||||
int ret;
|
||||
|
||||
/* Avoid issuing commands too often by rate limiting the update. */
|
||||
if (!__ratelimit(&ns->head->rs_nuse))
|
||||
return 0;
|
||||
|
||||
ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
|
||||
if (ret)
|
||||
goto out_free_id;
|
||||
|
||||
ns->head->nuse = le64_to_cpu(id->nuse);
|
||||
|
||||
out_free_id:
|
||||
kfree(id);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct nvme_ns_head *head = dev_to_ns_head(dev);
|
||||
struct gendisk *disk = dev_to_disk(dev);
|
||||
struct block_device *bdev = disk->part0;
|
||||
int ret;
|
||||
|
||||
if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
|
||||
bdev->bd_disk->fops == &nvme_ns_head_ops)
|
||||
ret = ns_head_update_nuse(head);
|
||||
else
|
||||
ret = ns_update_nuse(bdev->bd_disk->private_data);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return sysfs_emit(buf, "%llu\n", head->nuse);
|
||||
}
|
||||
static DEVICE_ATTR_RO(nuse);
|
||||
|
||||
static struct attribute *nvme_ns_attrs[] = {
|
||||
&dev_attr_wwid.attr,
|
||||
&dev_attr_uuid.attr,
|
||||
&dev_attr_nguid.attr,
|
||||
&dev_attr_eui.attr,
|
||||
&dev_attr_csi.attr,
|
||||
&dev_attr_nsid.attr,
|
||||
&dev_attr_metadata_bytes.attr,
|
||||
&dev_attr_nuse.attr,
|
||||
#ifdef CONFIG_NVME_MULTIPATH
|
||||
&dev_attr_ana_grpid.attr,
|
||||
&dev_attr_ana_state.attr,
|
||||
@ -127,7 +212,7 @@ static struct attribute *nvme_ns_id_attrs[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
|
||||
static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj,
|
||||
struct attribute *a, int n)
|
||||
{
|
||||
struct device *dev = container_of(kobj, struct device, kobj);
|
||||
@ -157,13 +242,13 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
|
||||
return a->mode;
|
||||
}
|
||||
|
||||
static const struct attribute_group nvme_ns_id_attr_group = {
|
||||
.attrs = nvme_ns_id_attrs,
|
||||
.is_visible = nvme_ns_id_attrs_are_visible,
|
||||
static const struct attribute_group nvme_ns_attr_group = {
|
||||
.attrs = nvme_ns_attrs,
|
||||
.is_visible = nvme_ns_attrs_are_visible,
|
||||
};
|
||||
|
||||
const struct attribute_group *nvme_ns_id_attr_groups[] = {
|
||||
&nvme_ns_id_attr_group,
|
||||
const struct attribute_group *nvme_ns_attr_groups[] = {
|
||||
&nvme_ns_attr_group,
|
||||
NULL,
|
||||
};
|
||||
|
||||
|
@ -11,7 +11,7 @@ int nvme_revalidate_zones(struct nvme_ns *ns)
|
||||
{
|
||||
struct request_queue *q = ns->queue;
|
||||
|
||||
blk_queue_chunk_sectors(q, ns->zsze);
|
||||
blk_queue_chunk_sectors(q, ns->head->zsze);
|
||||
blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
|
||||
|
||||
return blk_revalidate_disk_zones(ns->disk, NULL);
|
||||
@ -99,16 +99,17 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
|
||||
goto free_data;
|
||||
}
|
||||
|
||||
ns->zsze = nvme_lba_to_sect(ns, le64_to_cpu(id->lbafe[lbaf].zsze));
|
||||
if (!is_power_of_2(ns->zsze)) {
|
||||
ns->head->zsze =
|
||||
nvme_lba_to_sect(ns->head, le64_to_cpu(id->lbafe[lbaf].zsze));
|
||||
if (!is_power_of_2(ns->head->zsze)) {
|
||||
dev_warn(ns->ctrl->device,
|
||||
"invalid zone size:%llu for namespace:%u\n",
|
||||
ns->zsze, ns->head->ns_id);
|
||||
ns->head->zsze, ns->head->ns_id);
|
||||
status = -ENODEV;
|
||||
goto free_data;
|
||||
}
|
||||
|
||||
disk_set_zoned(ns->disk, BLK_ZONED_HM);
|
||||
disk_set_zoned(ns->disk);
|
||||
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
|
||||
disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
|
||||
disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
|
||||
@ -128,7 +129,7 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
|
||||
sizeof(struct nvme_zone_descriptor);
|
||||
|
||||
nr_zones = min_t(unsigned int, nr_zones,
|
||||
get_capacity(ns->disk) >> ilog2(ns->zsze));
|
||||
get_capacity(ns->disk) >> ilog2(ns->head->zsze));
|
||||
|
||||
bufsize = sizeof(struct nvme_zone_report) +
|
||||
nr_zones * sizeof(struct nvme_zone_descriptor);
|
||||
@ -147,7 +148,8 @@ static void *nvme_zns_alloc_report_buffer(struct nvme_ns *ns,
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int nvme_zone_parse_entry(struct nvme_ns *ns,
|
||||
static int nvme_zone_parse_entry(struct nvme_ctrl *ctrl,
|
||||
struct nvme_ns_head *head,
|
||||
struct nvme_zone_descriptor *entry,
|
||||
unsigned int idx, report_zones_cb cb,
|
||||
void *data)
|
||||
@ -155,20 +157,20 @@ static int nvme_zone_parse_entry(struct nvme_ns *ns,
|
||||
struct blk_zone zone = { };
|
||||
|
||||
if ((entry->zt & 0xf) != NVME_ZONE_TYPE_SEQWRITE_REQ) {
|
||||
dev_err(ns->ctrl->device, "invalid zone type %#x\n",
|
||||
dev_err(ctrl->device, "invalid zone type %#x\n",
|
||||
entry->zt);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
|
||||
zone.cond = entry->zs >> 4;
|
||||
zone.len = ns->zsze;
|
||||
zone.capacity = nvme_lba_to_sect(ns, le64_to_cpu(entry->zcap));
|
||||
zone.start = nvme_lba_to_sect(ns, le64_to_cpu(entry->zslba));
|
||||
zone.len = head->zsze;
|
||||
zone.capacity = nvme_lba_to_sect(head, le64_to_cpu(entry->zcap));
|
||||
zone.start = nvme_lba_to_sect(head, le64_to_cpu(entry->zslba));
|
||||
if (zone.cond == BLK_ZONE_COND_FULL)
|
||||
zone.wp = zone.start + zone.len;
|
||||
else
|
||||
zone.wp = nvme_lba_to_sect(ns, le64_to_cpu(entry->wp));
|
||||
zone.wp = nvme_lba_to_sect(head, le64_to_cpu(entry->wp));
|
||||
|
||||
return cb(&zone, idx, data);
|
||||
}
|
||||
@ -196,11 +198,11 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
|
||||
c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
|
||||
c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;
|
||||
|
||||
sector &= ~(ns->zsze - 1);
|
||||
sector &= ~(ns->head->zsze - 1);
|
||||
while (zone_idx < nr_zones && sector < get_capacity(ns->disk)) {
|
||||
memset(report, 0, buflen);
|
||||
|
||||
c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns, sector));
|
||||
c.zmr.slba = cpu_to_le64(nvme_sect_to_lba(ns->head, sector));
|
||||
ret = nvme_submit_sync_cmd(ns->queue, &c, report, buflen);
|
||||
if (ret) {
|
||||
if (ret > 0)
|
||||
@ -213,14 +215,15 @@ int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
|
||||
break;
|
||||
|
||||
for (i = 0; i < nz && zone_idx < nr_zones; i++) {
|
||||
ret = nvme_zone_parse_entry(ns, &report->entries[i],
|
||||
ret = nvme_zone_parse_entry(ns->ctrl, ns->head,
|
||||
&report->entries[i],
|
||||
zone_idx, cb, data);
|
||||
if (ret)
|
||||
goto out_free;
|
||||
zone_idx++;
|
||||
}
|
||||
|
||||
sector += ns->zsze * nz;
|
||||
sector += ns->head->zsze * nz;
|
||||
}
|
||||
|
||||
if (zone_idx > 0)
|
||||
@ -239,7 +242,7 @@ blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
|
||||
|
||||
c->zms.opcode = nvme_cmd_zone_mgmt_send;
|
||||
c->zms.nsid = cpu_to_le32(ns->head->ns_id);
|
||||
c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns, blk_rq_pos(req)));
|
||||
c->zms.slba = cpu_to_le64(nvme_sect_to_lba(ns->head, blk_rq_pos(req)));
|
||||
c->zms.zsa = action;
|
||||
|
||||
if (req_op(req) == REQ_OP_ZONE_RESET_ALL)
|
||||
|
@ -1276,7 +1276,7 @@ static ssize_t nvmet_subsys_attr_cntlid_min_store(struct config_item *item,
|
||||
return -EINVAL;
|
||||
|
||||
down_write(&nvmet_config_sem);
|
||||
if (cntlid_min >= to_subsys(item)->cntlid_max)
|
||||
if (cntlid_min > to_subsys(item)->cntlid_max)
|
||||
goto out_unlock;
|
||||
to_subsys(item)->cntlid_min = cntlid_min;
|
||||
up_write(&nvmet_config_sem);
|
||||
@ -1306,7 +1306,7 @@ static ssize_t nvmet_subsys_attr_cntlid_max_store(struct config_item *item,
|
||||
return -EINVAL;
|
||||
|
||||
down_write(&nvmet_config_sem);
|
||||
if (cntlid_max <= to_subsys(item)->cntlid_min)
|
||||
if (cntlid_max < to_subsys(item)->cntlid_min)
|
||||
goto out_unlock;
|
||||
to_subsys(item)->cntlid_max = cntlid_max;
|
||||
up_write(&nvmet_config_sem);
|
||||
|
@ -1425,9 +1425,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
|
||||
if (!ctrl->sqs)
|
||||
goto out_free_changed_ns_list;
|
||||
|
||||
if (subsys->cntlid_min > subsys->cntlid_max)
|
||||
goto out_free_sqs;
|
||||
|
||||
ret = ida_alloc_range(&cntlid_ida,
|
||||
subsys->cntlid_min, subsys->cntlid_max,
|
||||
GFP_KERNEL);
|
||||
|
@ -602,7 +602,7 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys)
|
||||
goto out_put_file;
|
||||
}
|
||||
|
||||
old = xa_cmpxchg(&passthru_subsystems, ctrl->cntlid, NULL,
|
||||
old = xa_cmpxchg(&passthru_subsystems, ctrl->instance, NULL,
|
||||
subsys, GFP_KERNEL);
|
||||
if (xa_is_err(old)) {
|
||||
ret = xa_err(old);
|
||||
@ -635,7 +635,7 @@ out_unlock:
|
||||
static void __nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
|
||||
{
|
||||
if (subsys->passthru_ctrl) {
|
||||
xa_erase(&passthru_subsystems, subsys->passthru_ctrl->cntlid);
|
||||
xa_erase(&passthru_subsystems, subsys->passthru_ctrl->instance);
|
||||
module_put(subsys->passthru_ctrl->ops->module);
|
||||
nvme_put_ctrl(subsys->passthru_ctrl);
|
||||
}
|
||||
|
@ -339,7 +339,7 @@ struct sdebug_dev_info {
|
||||
bool used;
|
||||
|
||||
/* For ZBC devices */
|
||||
enum blk_zoned_model zmodel;
|
||||
bool zoned;
|
||||
unsigned int zcap;
|
||||
unsigned int zsize;
|
||||
unsigned int zsize_shift;
|
||||
@ -844,8 +844,11 @@ static bool write_since_sync;
|
||||
static bool sdebug_statistics = DEF_STATISTICS;
|
||||
static bool sdebug_wp;
|
||||
static bool sdebug_allow_restart;
|
||||
/* Following enum: 0: no zbc, def; 1: host aware; 2: host managed */
|
||||
static enum blk_zoned_model sdeb_zbc_model = BLK_ZONED_NONE;
|
||||
static enum {
|
||||
BLK_ZONED_NONE = 0,
|
||||
BLK_ZONED_HA = 1,
|
||||
BLK_ZONED_HM = 2,
|
||||
} sdeb_zbc_model = BLK_ZONED_NONE;
|
||||
static char *sdeb_zbc_model_s;
|
||||
|
||||
enum sam_lun_addr_method {SAM_LUN_AM_PERIPHERAL = 0x0,
|
||||
@ -1815,8 +1818,6 @@ static int inquiry_vpd_b1(struct sdebug_dev_info *devip, unsigned char *arr)
|
||||
arr[1] = 1; /* non rotating medium (e.g. solid state) */
|
||||
arr[2] = 0;
|
||||
arr[3] = 5; /* less than 1.8" */
|
||||
if (devip->zmodel == BLK_ZONED_HA)
|
||||
arr[4] = 1 << 4; /* zoned field = 01b */
|
||||
|
||||
return 0x3c;
|
||||
}
|
||||
@ -1883,7 +1884,7 @@ static int resp_inquiry(struct scsi_cmnd *scp, struct sdebug_dev_info *devip)
|
||||
if (! arr)
|
||||
return DID_REQUEUE << 16;
|
||||
is_disk = (sdebug_ptype == TYPE_DISK);
|
||||
is_zbc = (devip->zmodel != BLK_ZONED_NONE);
|
||||
is_zbc = devip->zoned;
|
||||
is_disk_zbc = (is_disk || is_zbc);
|
||||
have_wlun = scsi_is_wlun(scp->device->lun);
|
||||
if (have_wlun)
|
||||
@ -2195,7 +2196,7 @@ static int resp_readcap16(struct scsi_cmnd *scp,
|
||||
* Since the scsi_debug READ CAPACITY implementation always reports the
|
||||
* total disk capacity, set RC BASIS = 1 for host-managed ZBC devices.
|
||||
*/
|
||||
if (devip->zmodel == BLK_ZONED_HM)
|
||||
if (devip->zoned)
|
||||
arr[12] |= 1 << 4;
|
||||
|
||||
arr[15] = sdebug_lowest_aligned & 0xff;
|
||||
@ -2648,7 +2649,7 @@ static int resp_mode_sense(struct scsi_cmnd *scp,
|
||||
msense_6 = (MODE_SENSE == cmd[0]);
|
||||
llbaa = msense_6 ? false : !!(cmd[1] & 0x10);
|
||||
is_disk = (sdebug_ptype == TYPE_DISK);
|
||||
is_zbc = (devip->zmodel != BLK_ZONED_NONE);
|
||||
is_zbc = devip->zoned;
|
||||
if ((is_disk || is_zbc) && !dbd)
|
||||
bd_len = llbaa ? 16 : 8;
|
||||
else
|
||||
@ -3194,8 +3195,6 @@ static int check_zbc_access_params(struct scsi_cmnd *scp,
|
||||
struct sdeb_zone_state *zsp_end = zbc_zone(devip, lba + num - 1);
|
||||
|
||||
if (!write) {
|
||||
if (devip->zmodel == BLK_ZONED_HA)
|
||||
return 0;
|
||||
/* For host-managed, reads cannot cross zone types boundaries */
|
||||
if (zsp->z_type != zsp_end->z_type) {
|
||||
mk_sense_buffer(scp, ILLEGAL_REQUEST,
|
||||
@ -5322,7 +5321,7 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip)
|
||||
if (devip->zcap < devip->zsize)
|
||||
devip->nr_zones += devip->nr_seq_zones;
|
||||
|
||||
if (devip->zmodel == BLK_ZONED_HM) {
|
||||
if (devip->zoned) {
|
||||
/* zbc_max_open_zones can be 0, meaning "not reported" */
|
||||
if (sdeb_zbc_max_open >= devip->nr_zones - 1)
|
||||
devip->max_open = (devip->nr_zones - 1) / 2;
|
||||
@ -5347,7 +5346,7 @@ static int sdebug_device_create_zones(struct sdebug_dev_info *devip)
|
||||
zsp->z_size =
|
||||
min_t(u64, devip->zsize, capacity - zstart);
|
||||
} else if ((zstart & (devip->zsize - 1)) == 0) {
|
||||
if (devip->zmodel == BLK_ZONED_HM)
|
||||
if (devip->zoned)
|
||||
zsp->z_type = ZBC_ZTYPE_SWR;
|
||||
else
|
||||
zsp->z_type = ZBC_ZTYPE_SWP;
|
||||
@ -5390,13 +5389,13 @@ static struct sdebug_dev_info *sdebug_device_create(
|
||||
}
|
||||
devip->sdbg_host = sdbg_host;
|
||||
if (sdeb_zbc_in_use) {
|
||||
devip->zmodel = sdeb_zbc_model;
|
||||
devip->zoned = sdeb_zbc_model == BLK_ZONED_HM;
|
||||
if (sdebug_device_create_zones(devip)) {
|
||||
kfree(devip);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
devip->zmodel = BLK_ZONED_NONE;
|
||||
devip->zoned = false;
|
||||
}
|
||||
devip->create_ts = ktime_get_boottime();
|
||||
atomic_set(&devip->stopped, (sdeb_tur_ms_to_ready > 0 ? 2 : 0));
|
||||
|
@ -3117,7 +3117,6 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
|
||||
struct request_queue *q = sdkp->disk->queue;
|
||||
struct scsi_vpd *vpd;
|
||||
u16 rot;
|
||||
u8 zoned;
|
||||
|
||||
rcu_read_lock();
|
||||
vpd = rcu_dereference(sdkp->device->vpd_pgb1);
|
||||
@ -3128,7 +3127,7 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
|
||||
}
|
||||
|
||||
rot = get_unaligned_be16(&vpd->data[4]);
|
||||
zoned = (vpd->data[8] >> 4) & 3;
|
||||
sdkp->zoned = (vpd->data[8] >> 4) & 3;
|
||||
rcu_read_unlock();
|
||||
|
||||
if (rot == 1) {
|
||||
@ -3136,39 +3135,37 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q);
|
||||
}
|
||||
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED /* sd_probe rejects ZBD devices early otherwise */
|
||||
if (sdkp->device->type == TYPE_ZBC) {
|
||||
/*
|
||||
* Host-managed: Per ZBC and ZAC specifications, writes in
|
||||
* sequential write required zones of host-managed devices must
|
||||
* be aligned to the device physical block size.
|
||||
* Host-managed.
|
||||
*/
|
||||
disk_set_zoned(sdkp->disk);
|
||||
|
||||
/*
|
||||
* Per ZBC and ZAC specifications, writes in sequential write
|
||||
* required zones of host-managed devices must be aligned to
|
||||
* the device physical block size.
|
||||
*/
|
||||
disk_set_zoned(sdkp->disk, BLK_ZONED_HM);
|
||||
blk_queue_zone_write_granularity(q, sdkp->physical_block_size);
|
||||
} else {
|
||||
sdkp->zoned = zoned;
|
||||
if (sdkp->zoned == 1) {
|
||||
/* Host-aware */
|
||||
disk_set_zoned(sdkp->disk, BLK_ZONED_HA);
|
||||
} else {
|
||||
/* Regular disk or drive managed disk */
|
||||
disk_set_zoned(sdkp->disk, BLK_ZONED_NONE);
|
||||
}
|
||||
/*
|
||||
* Host-aware devices are treated as conventional.
|
||||
*/
|
||||
WARN_ON_ONCE(blk_queue_is_zoned(q));
|
||||
}
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
|
||||
if (!sdkp->first_scan)
|
||||
return;
|
||||
|
||||
if (blk_queue_is_zoned(q)) {
|
||||
sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n",
|
||||
q->limits.zoned == BLK_ZONED_HM ? "managed" : "aware");
|
||||
} else {
|
||||
if (sdkp->zoned == 1)
|
||||
sd_printk(KERN_NOTICE, sdkp,
|
||||
"Host-aware SMR disk used as regular disk\n");
|
||||
else if (sdkp->zoned == 2)
|
||||
sd_printk(KERN_NOTICE, sdkp,
|
||||
"Drive-managed SMR disk\n");
|
||||
}
|
||||
if (blk_queue_is_zoned(q))
|
||||
sd_printk(KERN_NOTICE, sdkp, "Host-managed zoned block device\n");
|
||||
else if (sdkp->zoned == 1)
|
||||
sd_printk(KERN_NOTICE, sdkp, "Host-aware SMR disk used as regular disk\n");
|
||||
else if (sdkp->zoned == 2)
|
||||
sd_printk(KERN_NOTICE, sdkp, "Drive-managed SMR disk\n");
|
||||
}
|
||||
|
||||
/**
|
||||
@ -3502,7 +3499,7 @@ static int sd_revalidate_disk(struct gendisk *disk)
|
||||
} else {
|
||||
q->limits.io_opt = 0;
|
||||
rw_max = min_not_zero(logical_to_sectors(sdp, dev_max),
|
||||
(sector_t)BLK_DEF_MAX_SECTORS);
|
||||
(sector_t)BLK_DEF_MAX_SECTORS_CAP);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -836,10 +836,7 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp)
|
||||
|
||||
/*
|
||||
* For all zoned disks, initialize zone append emulation data if not
|
||||
* already done. This is necessary also for host-aware disks used as
|
||||
* regular disks due to the presence of partitions as these partitions
|
||||
* may be deleted and the disk zoned model changed back from
|
||||
* BLK_ZONED_NONE to BLK_ZONED_HA.
|
||||
* already done.
|
||||
*/
|
||||
if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) {
|
||||
ret = sd_zbc_init_disk(sdkp);
|
||||
@ -932,17 +929,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE])
|
||||
sdkp->device->use_10_for_rw = 0;
|
||||
sdkp->device->use_16_for_sync = 1;
|
||||
|
||||
if (!blk_queue_is_zoned(q)) {
|
||||
/*
|
||||
* This can happen for a host aware disk with partitions.
|
||||
* The block device zone model was already cleared by
|
||||
* disk_set_zoned(). Only free the scsi disk zone
|
||||
* information and exit early.
|
||||
*/
|
||||
sd_zbc_free_zone_info(sdkp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Check zoned block device characteristics (unconstrained reads) */
|
||||
ret = sd_zbc_check_zoned_characteristics(sdkp, buf);
|
||||
if (ret)
|
||||
|
@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
|
||||
|
||||
kvfree(zones);
|
||||
|
||||
switch (bdev_zoned_model(bdev)) {
|
||||
case BLK_ZONED_HM:
|
||||
if (bdev_is_zoned(bdev)) {
|
||||
model = "host-managed zoned";
|
||||
emulated = "";
|
||||
break;
|
||||
case BLK_ZONED_HA:
|
||||
model = "host-aware zoned";
|
||||
emulated = "";
|
||||
break;
|
||||
case BLK_ZONED_NONE:
|
||||
} else {
|
||||
model = "regular";
|
||||
emulated = "emulated ";
|
||||
break;
|
||||
default:
|
||||
/* Just in case */
|
||||
btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
|
||||
bdev_zoned_model(bdev),
|
||||
rcu_str_deref(device->name));
|
||||
ret = -EOPNOTSUPP;
|
||||
goto out_free_zone_info;
|
||||
}
|
||||
|
||||
btrfs_info_in_rcu(fs_info,
|
||||
@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
|
||||
|
||||
out:
|
||||
kvfree(zones);
|
||||
out_free_zone_info:
|
||||
btrfs_destroy_dev_zone_info(device);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
|
||||
struct btrfs_device *device;
|
||||
|
||||
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
|
||||
if (device->bdev &&
|
||||
bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
|
||||
if (device->bdev && bdev_is_zoned(device->bdev)) {
|
||||
btrfs_err(fs_info,
|
||||
"zoned: mode not enabled but zoned device found: %pg",
|
||||
device->bdev);
|
||||
|
@ -320,7 +320,7 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
|
||||
}
|
||||
|
||||
/* Do not allow Host Managed zoned device. */
|
||||
return bdev_zoned_model(bdev) != BLK_ZONED_HM;
|
||||
return !bdev_is_zoned(bdev);
|
||||
}
|
||||
|
||||
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
|
||||
|
@ -995,7 +995,7 @@ static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr)
|
||||
}
|
||||
blkaddr -= FDEV(devi).start_blk;
|
||||
}
|
||||
return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM &&
|
||||
return bdev_is_zoned(FDEV(devi).bdev) &&
|
||||
f2fs_blkz_is_seq(sbi, devi, blkaddr) &&
|
||||
(blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1);
|
||||
}
|
||||
|
@ -4279,24 +4279,21 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
|
||||
sbi->aligned_blksize = false;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
|
||||
!f2fs_sb_has_blkzoned(sbi)) {
|
||||
f2fs_err(sbi, "Zoned block device feature not enabled");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
|
||||
if (bdev_is_zoned(FDEV(i).bdev)) {
|
||||
if (!f2fs_sb_has_blkzoned(sbi)) {
|
||||
f2fs_err(sbi, "Zoned block device feature not enabled");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (init_blkz_info(sbi, i)) {
|
||||
f2fs_err(sbi, "Failed to initialize F2FS blkzone information");
|
||||
return -EINVAL;
|
||||
}
|
||||
if (max_devices == 1)
|
||||
break;
|
||||
f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
|
||||
f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)",
|
||||
i, FDEV(i).path,
|
||||
FDEV(i).total_segments,
|
||||
FDEV(i).start_blk, FDEV(i).end_blk,
|
||||
bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
|
||||
"Host-aware" : "Host-managed");
|
||||
FDEV(i).start_blk, FDEV(i).end_blk);
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
@ -324,6 +324,8 @@ enum bip_flags {
|
||||
BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */
|
||||
BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */
|
||||
BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */
|
||||
BIP_INTEGRITY_USER = 1 << 5, /* Integrity payload is user address */
|
||||
BIP_COPY_USER = 1 << 6, /* Kernel bounce buffer in use */
|
||||
};
|
||||
|
||||
/*
|
||||
@ -718,6 +720,7 @@ static inline bool bioset_initialized(struct bio_set *bs)
|
||||
for_each_bio(_bio) \
|
||||
bip_for_each_vec(_bvl, _bio->bi_integrity, _iter)
|
||||
|
||||
int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len, u32 seed);
|
||||
extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
|
||||
extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
|
||||
extern bool bio_integrity_prep(struct bio *);
|
||||
@ -789,6 +792,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
|
||||
ssize_t len, u32 seed)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
||||
|
||||
/*
|
||||
|
@ -830,6 +830,12 @@ void blk_mq_end_request_batch(struct io_comp_batch *ib);
|
||||
*/
|
||||
static inline bool blk_mq_need_time_stamp(struct request *rq)
|
||||
{
|
||||
/*
|
||||
* passthrough io doesn't use iostat accounting, cgroup stats
|
||||
* and io scheduler functionalities.
|
||||
*/
|
||||
if (blk_rq_is_passthrough(rq))
|
||||
return false;
|
||||
return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
|
||||
}
|
||||
|
||||
|
@ -378,6 +378,8 @@ enum req_op {
|
||||
REQ_OP_DISCARD = (__force blk_opf_t)3,
|
||||
/* securely erase sectors */
|
||||
REQ_OP_SECURE_ERASE = (__force blk_opf_t)5,
|
||||
/* write data at the current zone write pointer */
|
||||
REQ_OP_ZONE_APPEND = (__force blk_opf_t)7,
|
||||
/* write the zero filled sector many times */
|
||||
REQ_OP_WRITE_ZEROES = (__force blk_opf_t)9,
|
||||
/* Open a zone */
|
||||
@ -386,12 +388,10 @@ enum req_op {
|
||||
REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11,
|
||||
/* Transition a zone to full */
|
||||
REQ_OP_ZONE_FINISH = (__force blk_opf_t)12,
|
||||
/* write data at the current zone write pointer */
|
||||
REQ_OP_ZONE_APPEND = (__force blk_opf_t)13,
|
||||
/* reset a zone write pointer */
|
||||
REQ_OP_ZONE_RESET = (__force blk_opf_t)15,
|
||||
REQ_OP_ZONE_RESET = (__force blk_opf_t)13,
|
||||
/* reset all the zone present on the device */
|
||||
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17,
|
||||
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15,
|
||||
|
||||
/* Driver private requests */
|
||||
REQ_OP_DRV_IN = (__force blk_opf_t)34,
|
||||
|
@ -265,18 +265,6 @@ static inline bool blk_op_is_passthrough(blk_opf_t op)
|
||||
return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT;
|
||||
}
|
||||
|
||||
/*
|
||||
* Zoned block device models (zoned limit).
|
||||
*
|
||||
* Note: This needs to be ordered from the least to the most severe
|
||||
* restrictions for the inheritance in blk_stack_limits() to work.
|
||||
*/
|
||||
enum blk_zoned_model {
|
||||
BLK_ZONED_NONE = 0, /* Regular block device */
|
||||
BLK_ZONED_HA, /* Host-aware zoned block device */
|
||||
BLK_ZONED_HM, /* Host-managed zoned block device */
|
||||
};
|
||||
|
||||
/*
|
||||
* BLK_BOUNCE_NONE: never bounce (default)
|
||||
* BLK_BOUNCE_HIGH: bounce all highmem pages
|
||||
@ -318,7 +306,7 @@ struct queue_limits {
|
||||
unsigned char misaligned;
|
||||
unsigned char discard_misaligned;
|
||||
unsigned char raid_partial_stripes_expensive;
|
||||
enum blk_zoned_model zoned;
|
||||
bool zoned;
|
||||
|
||||
/*
|
||||
* Drivers that set dma_alignment to less than 511 must be prepared to
|
||||
@ -331,24 +319,15 @@ struct queue_limits {
|
||||
typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
|
||||
void *data);
|
||||
|
||||
void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model);
|
||||
void disk_set_zoned(struct gendisk *disk);
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
#define BLK_ALL_ZONES ((unsigned int)-1)
|
||||
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data);
|
||||
unsigned int bdev_nr_zones(struct block_device *bdev);
|
||||
extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
|
||||
sector_t sectors, sector_t nr_sectors,
|
||||
gfp_t gfp_mask);
|
||||
unsigned int nr_zones, report_zones_cb cb, void *data);
|
||||
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
|
||||
sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask);
|
||||
int blk_revalidate_disk_zones(struct gendisk *disk,
|
||||
void (*update_driver_data)(struct gendisk *disk));
|
||||
#else /* CONFIG_BLK_DEV_ZONED */
|
||||
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_BLK_DEV_ZONED */
|
||||
void (*update_driver_data)(struct gendisk *disk));
|
||||
|
||||
/*
|
||||
* Independent access ranges: struct blk_independent_access_range describes
|
||||
@ -378,59 +357,51 @@ struct blk_independent_access_ranges {
|
||||
};
|
||||
|
||||
struct request_queue {
|
||||
struct request *last_merge;
|
||||
struct elevator_queue *elevator;
|
||||
|
||||
struct percpu_ref q_usage_counter;
|
||||
|
||||
struct blk_queue_stats *stats;
|
||||
struct rq_qos *rq_qos;
|
||||
struct mutex rq_qos_mutex;
|
||||
|
||||
const struct blk_mq_ops *mq_ops;
|
||||
|
||||
/* sw queues */
|
||||
struct blk_mq_ctx __percpu *queue_ctx;
|
||||
|
||||
unsigned int queue_depth;
|
||||
|
||||
/* hw dispatch queues */
|
||||
struct xarray hctx_table;
|
||||
unsigned int nr_hw_queues;
|
||||
|
||||
/*
|
||||
* The queue owner gets to use this for whatever they like.
|
||||
* ll_rw_blk doesn't touch it.
|
||||
*/
|
||||
void *queuedata;
|
||||
|
||||
struct elevator_queue *elevator;
|
||||
|
||||
const struct blk_mq_ops *mq_ops;
|
||||
|
||||
/* sw queues */
|
||||
struct blk_mq_ctx __percpu *queue_ctx;
|
||||
|
||||
/*
|
||||
* various queue flags, see QUEUE_* below
|
||||
*/
|
||||
unsigned long queue_flags;
|
||||
/*
|
||||
* Number of contexts that have called blk_set_pm_only(). If this
|
||||
* counter is above zero then only RQF_PM requests are processed.
|
||||
*/
|
||||
atomic_t pm_only;
|
||||
|
||||
/*
|
||||
* ida allocated id for this queue. Used to index queues from
|
||||
* ioctx.
|
||||
*/
|
||||
int id;
|
||||
unsigned int rq_timeout;
|
||||
|
||||
unsigned int queue_depth;
|
||||
|
||||
refcount_t refs;
|
||||
|
||||
/* hw dispatch queues */
|
||||
unsigned int nr_hw_queues;
|
||||
struct xarray hctx_table;
|
||||
|
||||
struct percpu_ref q_usage_counter;
|
||||
|
||||
struct request *last_merge;
|
||||
|
||||
spinlock_t queue_lock;
|
||||
|
||||
struct gendisk *disk;
|
||||
int quiesce_depth;
|
||||
|
||||
refcount_t refs;
|
||||
struct gendisk *disk;
|
||||
|
||||
/*
|
||||
* mq queue kobject
|
||||
*/
|
||||
struct kobject *mq_kobj;
|
||||
|
||||
struct queue_limits limits;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
||||
struct blk_integrity integrity;
|
||||
#endif /* CONFIG_BLK_DEV_INTEGRITY */
|
||||
@ -440,25 +411,41 @@ struct request_queue {
|
||||
enum rpm_status rpm_status;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Number of contexts that have called blk_set_pm_only(). If this
|
||||
* counter is above zero then only RQF_PM requests are processed.
|
||||
*/
|
||||
atomic_t pm_only;
|
||||
|
||||
struct blk_queue_stats *stats;
|
||||
struct rq_qos *rq_qos;
|
||||
struct mutex rq_qos_mutex;
|
||||
|
||||
/*
|
||||
* ida allocated id for this queue. Used to index queues from
|
||||
* ioctx.
|
||||
*/
|
||||
int id;
|
||||
|
||||
unsigned int dma_pad_mask;
|
||||
|
||||
/*
|
||||
* queue settings
|
||||
*/
|
||||
unsigned long nr_requests; /* Max # of requests */
|
||||
|
||||
unsigned int dma_pad_mask;
|
||||
|
||||
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
|
||||
struct blk_crypto_profile *crypto_profile;
|
||||
struct kobject *crypto_kobject;
|
||||
#endif
|
||||
|
||||
unsigned int rq_timeout;
|
||||
|
||||
struct timer_list timeout;
|
||||
struct work_struct timeout_work;
|
||||
|
||||
atomic_t nr_active_requests_shared_tags;
|
||||
|
||||
unsigned int required_elevator_features;
|
||||
|
||||
struct blk_mq_tags *sched_shared_tags;
|
||||
|
||||
struct list_head icq_list;
|
||||
@ -469,11 +456,12 @@ struct request_queue {
|
||||
struct mutex blkcg_mutex;
|
||||
#endif
|
||||
|
||||
struct queue_limits limits;
|
||||
|
||||
unsigned int required_elevator_features;
|
||||
|
||||
int node;
|
||||
|
||||
spinlock_t requeue_lock;
|
||||
struct list_head requeue_list;
|
||||
struct delayed_work requeue_work;
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_IO_TRACE
|
||||
struct blk_trace __rcu *blk_trace;
|
||||
#endif
|
||||
@ -483,10 +471,6 @@ struct request_queue {
|
||||
struct blk_flush_queue *fq;
|
||||
struct list_head flush_list;
|
||||
|
||||
struct list_head requeue_list;
|
||||
spinlock_t requeue_lock;
|
||||
struct delayed_work requeue_work;
|
||||
|
||||
struct mutex sysfs_lock;
|
||||
struct mutex sysfs_dir_lock;
|
||||
|
||||
@ -511,8 +495,6 @@ struct request_queue {
|
||||
*/
|
||||
struct mutex mq_freeze_lock;
|
||||
|
||||
int quiesce_depth;
|
||||
|
||||
struct blk_mq_tag_set *tag_set;
|
||||
struct list_head tag_set_list;
|
||||
|
||||
@ -625,26 +607,14 @@ static inline enum rpm_status queue_rpm_status(struct request_queue *q)
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline enum blk_zoned_model
|
||||
blk_queue_zoned_model(struct request_queue *q)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED))
|
||||
return q->limits.zoned;
|
||||
return BLK_ZONED_NONE;
|
||||
}
|
||||
|
||||
static inline bool blk_queue_is_zoned(struct request_queue *q)
|
||||
{
|
||||
switch (blk_queue_zoned_model(q)) {
|
||||
case BLK_ZONED_HA:
|
||||
case BLK_ZONED_HM:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
return IS_ENABLED(CONFIG_BLK_DEV_ZONED) && q->limits.zoned;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_ZONED
|
||||
unsigned int bdev_nr_zones(struct block_device *bdev);
|
||||
|
||||
static inline unsigned int disk_nr_zones(struct gendisk *disk)
|
||||
{
|
||||
return blk_queue_is_zoned(disk->queue) ? disk->nr_zones : 0;
|
||||
@ -689,6 +659,11 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
|
||||
}
|
||||
|
||||
#else /* CONFIG_BLK_DEV_ZONED */
|
||||
static inline unsigned int bdev_nr_zones(struct block_device *bdev)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned int disk_nr_zones(struct gendisk *disk)
|
||||
{
|
||||
return 0;
|
||||
@ -1082,7 +1057,14 @@ enum blk_default_limits {
|
||||
BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
|
||||
};
|
||||
|
||||
#define BLK_DEF_MAX_SECTORS 2560u
|
||||
/*
|
||||
* Default upper limit for the software max_sectors limit used for
|
||||
* regular file system I/O. This can be increased through sysfs.
|
||||
*
|
||||
* Not to be confused with the max_hw_sector limit that is entirely
|
||||
* controlled by the driver, usually based on hardware limits.
|
||||
*/
|
||||
#define BLK_DEF_MAX_SECTORS_CAP 2560u
|
||||
|
||||
static inline unsigned long queue_segment_boundary(const struct request_queue *q)
|
||||
{
|
||||
@ -1261,11 +1243,6 @@ static inline bool bdev_nowait(struct block_device *bdev)
|
||||
return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
|
||||
}
|
||||
|
||||
static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
|
||||
{
|
||||
return blk_queue_zoned_model(bdev_get_queue(bdev));
|
||||
}
|
||||
|
||||
static inline bool bdev_is_zoned(struct block_device *bdev)
|
||||
{
|
||||
return blk_queue_is_zoned(bdev_get_queue(bdev));
|
||||
|
@ -28,17 +28,12 @@ enum io_uring_cmd_flags {
|
||||
|
||||
/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */
|
||||
#define IORING_URING_CMD_CANCELABLE (1U << 30)
|
||||
#define IORING_URING_CMD_POLLED (1U << 31)
|
||||
|
||||
struct io_uring_cmd {
|
||||
struct file *file;
|
||||
const struct io_uring_sqe *sqe;
|
||||
union {
|
||||
/* callback to defer completions to task context */
|
||||
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
|
||||
/* used for polled completion */
|
||||
void *cookie;
|
||||
};
|
||||
/* callback to defer completions to task context */
|
||||
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
|
||||
u32 cmd_op;
|
||||
u32 flags;
|
||||
u8 pdu[32]; /* available inline for free use */
|
||||
|
@ -2,15 +2,11 @@
|
||||
/*
|
||||
md_p.h : physical layout of Linux RAID devices
|
||||
Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
|
||||
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
(for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#ifndef _MD_P_H
|
||||
@ -237,7 +233,7 @@ struct mdp_superblock_1 {
|
||||
char set_name[32]; /* set and interpreted by user-space */
|
||||
|
||||
__le64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
|
||||
__le32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
|
||||
__le32 level; /* 0,1,4,5 */
|
||||
__le32 layout; /* only for raid5 and raid10 currently */
|
||||
__le64 size; /* used size of component devices, in 512byte sectors */
|
||||
|
||||
|
@ -2,15 +2,11 @@
|
||||
/*
|
||||
md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
|
||||
Copyright (C) 1998 Ingo Molnar
|
||||
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2, or (at your option)
|
||||
any later version.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
(for example /usr/src/linux/COPYING); if not, write to the Free
|
||||
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
*/
|
||||
|
||||
#ifndef _UAPI_MD_U_H
|
||||
@ -107,11 +103,6 @@ typedef struct mdu_array_info_s {
|
||||
|
||||
} mdu_array_info_t;
|
||||
|
||||
/* non-obvious values for 'level' */
|
||||
#define LEVEL_MULTIPATH (-4)
|
||||
#define LEVEL_LINEAR (-1)
|
||||
#define LEVEL_FAULTY (-5)
|
||||
|
||||
/* we need a value for 'no level specified' and 0
|
||||
* means 'raid0', so we need something else. This is
|
||||
* for internal use only
|
||||
|
@ -182,7 +182,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
|
||||
return -EOPNOTSUPP;
|
||||
issue_flags |= IO_URING_F_IOPOLL;
|
||||
req->iopoll_completed = 0;
|
||||
WRITE_ONCE(ioucmd->cookie, NULL);
|
||||
}
|
||||
|
||||
ret = file->f_op->uring_cmd(ioucmd, issue_flags);
|
||||
|
Loading…
Reference in New Issue
Block a user