Merge tag 'md-next-20230814-resend' into loongarch-next

LoongArch architecture changes for 6.5 (raid5/6 optimization) depend on
the md changes to fix build and work, so merge them to create a base.
commit 9d1785590b
Author: Huacai Chen
Date:   2023-08-30 17:35:54 +08:00

95 changed files with 1310 additions and 766 deletions

View File

@@ -5,6 +5,7 @@
 menuconfig BLOCK
 	bool "Enable the block layer" if EXPERT
 	default y
+	select FS_IOMAP
 	select SBITMAP
 	help
 	  Provide block layer support for the kernel.

View File

@@ -123,20 +123,38 @@ void bio_integrity_free(struct bio *bio)
 int bio_integrity_add_page(struct bio *bio, struct page *page,
 			   unsigned int len, unsigned int offset)
 {
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 	struct bio_integrity_payload *bip = bio_integrity(bio);
 
-	if (bip->bip_vcnt >= bip->bip_max_vcnt) {
-		printk(KERN_ERR "%s: bip_vec full\n", __func__);
+	if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) >
+	    queue_max_hw_sectors(q))
 		return 0;
-	}
 
-	if (bip->bip_vcnt &&
-	    bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
-			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
-		return 0;
+	if (bip->bip_vcnt > 0) {
+		struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
+		bool same_page = false;
+
+		if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+					   &same_page)) {
+			bip->bip_iter.bi_size += len;
+			return len;
+		}
+
+		if (bip->bip_vcnt >=
+		    min(bip->bip_max_vcnt, queue_max_integrity_segments(q)))
+			return 0;
+
+		/*
+		 * If the queue doesn't support SG gaps and adding this segment
+		 * would create a gap, disallow it.
+		 */
+		if (bvec_gap_to_prev(&q->limits, bv, offset))
+			return 0;
+	}
 
 	bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset);
 	bip->bip_vcnt++;
+	bip->bip_iter.bi_size += len;
 
 	return len;
 }
@@ -199,8 +217,6 @@ bool bio_integrity_prep(struct bio *bio)
 	unsigned long start, end;
 	unsigned int len, nr_pages;
 	unsigned int bytes, offset, i;
-	unsigned int intervals;
-	blk_status_t status;
 
 	if (!bi)
 		return true;
@@ -224,12 +240,10 @@ bool bio_integrity_prep(struct bio *bio)
 		    !(bi->flags & BLK_INTEGRITY_GENERATE))
 			return true;
 	}
-	intervals = bio_integrity_intervals(bi, bio_sectors(bio));
 
 	/* Allocate kernel buffer for protection data */
-	len = intervals * bi->tuple_size;
+	len = bio_integrity_bytes(bi, bio_sectors(bio));
 	buf = kmalloc(len, GFP_NOIO);
-	status = BLK_STS_RESOURCE;
 	if (unlikely(buf == NULL)) {
 		printk(KERN_ERR "could not allocate integrity buffer\n");
 		goto err_end_io;
@@ -244,12 +258,10 @@ bool bio_integrity_prep(struct bio *bio)
 	if (IS_ERR(bip)) {
 		printk(KERN_ERR "could not allocate data integrity bioset\n");
 		kfree(buf);
-		status = BLK_STS_RESOURCE;
 		goto err_end_io;
 	}
 
 	bip->bip_flags |= BIP_BLOCK_INTEGRITY;
-	bip->bip_iter.bi_size = len;
 	bip_set_seed(bip, bio->bi_iter.bi_sector);
 
 	if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
@@ -257,28 +269,18 @@ bool bio_integrity_prep(struct bio *bio)
 	/* Map it */
 	offset = offset_in_page(buf);
-	for (i = 0 ; i < nr_pages ; i++) {
-		int ret;
-
+	for (i = 0; i < nr_pages && len > 0; i++) {
 		bytes = PAGE_SIZE - offset;
 
-		if (len <= 0)
-			break;
-
 		if (bytes > len)
 			bytes = len;
 
-		ret = bio_integrity_add_page(bio, virt_to_page(buf),
-					     bytes, offset);
-
-		if (ret == 0) {
+		if (bio_integrity_add_page(bio, virt_to_page(buf),
+					   bytes, offset) < bytes) {
 			printk(KERN_ERR "could not attach integrity payload\n");
-			status = BLK_STS_RESOURCE;
 			goto err_end_io;
 		}
 
-		if (ret < bytes)
-			break;
-
 		buf += bytes;
 		len -= bytes;
 		offset = 0;
@@ -294,10 +296,9 @@ bool bio_integrity_prep(struct bio *bio)
 	return true;
 
 err_end_io:
-	bio->bi_status = status;
+	bio->bi_status = BLK_STS_RESOURCE;
 	bio_endio(bio);
 	return false;
-
 }
 EXPORT_SYMBOL(bio_integrity_prep);

View File

@@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(bio_kmalloc);
 
-void zero_fill_bio(struct bio *bio)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
 {
 	struct bio_vec bv;
 	struct bvec_iter iter;
 
-	bio_for_each_segment(bv, bio, iter)
+	__bio_for_each_segment(bv, bio, iter, start)
 		memzero_bvec(&bv);
 }
-EXPORT_SYMBOL(zero_fill_bio);
+EXPORT_SYMBOL(zero_fill_bio_iter);
 
 /**
  * bio_truncate - truncate the bio to small size of @new_size
@@ -903,9 +903,8 @@ static inline bool bio_full(struct bio *bio, unsigned len)
 	return false;
 }
 
-static inline bool page_is_mergeable(const struct bio_vec *bv,
-		struct page *page, unsigned int len, unsigned int off,
-		bool *same_page)
+static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
+		unsigned int len, unsigned int off, bool *same_page)
 {
 	size_t bv_end = bv->bv_offset + bv->bv_len;
 	phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
@@ -919,49 +918,15 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
 		return false;
 
 	*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
-	if (*same_page)
-		return true;
-	else if (IS_ENABLED(CONFIG_KMSAN))
-		return false;
-	return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
-}
-
-/**
- * __bio_try_merge_page - try appending data to an existing bvec.
- * @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio.  This is a
- * useful optimisation for file systems with a block size smaller than the
- * page size.
- *
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
- *
- * Return %true on success or %false on failure.
- */
-static bool __bio_try_merge_page(struct bio *bio, struct page *page,
-		unsigned int len, unsigned int off, bool *same_page)
-{
-	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
-		return false;
-
-	if (bio->bi_vcnt > 0) {
-		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-		if (page_is_mergeable(bv, page, len, off, same_page)) {
-			if (bio->bi_iter.bi_size > UINT_MAX - len) {
-				*same_page = false;
-				return false;
-			}
-			bv->bv_len += len;
-			bio->bi_iter.bi_size += len;
-			return true;
-		}
+	if (!*same_page) {
+		if (IS_ENABLED(CONFIG_KMSAN))
+			return false;
+		if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
+			return false;
 	}
-	return false;
+
+	bv->bv_len += len;
+	return true;
 }
 
 /*
@@ -969,11 +934,10 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page,
  * size limit. This is not for normal read/write bios, but for passthrough
  * or Zone Append operations that we can't split.
  */
-static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
-		struct page *page, unsigned len,
-		unsigned offset, bool *same_page)
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+		struct page *page, unsigned len, unsigned offset,
+		bool *same_page)
 {
-	struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 	unsigned long mask = queue_segment_boundary(q);
 	phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
 	phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
@@ -982,7 +946,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
 		return false;
 	if (bv->bv_len + len > queue_max_segment_size(q))
 		return false;
-	return __bio_try_merge_page(bio, page, len, offset, same_page);
+	return bvec_try_merge_page(bv, page, len, offset, same_page);
 }
 
 /**
@@ -1002,33 +966,33 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page)
 {
-	struct bio_vec *bvec;
-
 	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
 		return 0;
 
-	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+	if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors)
 		return 0;
 
 	if (bio->bi_vcnt > 0) {
-		if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+				same_page)) {
+			bio->bi_iter.bi_size += len;
 			return len;
+		}
+
+		if (bio->bi_vcnt >=
+		    min(bio->bi_max_vecs, queue_max_segments(q)))
+			return 0;
 
 		/*
 		 * If the queue doesn't support SG gaps and adding this segment
 		 * would create a gap, disallow it.
 		 */
-		bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-		if (bvec_gap_to_prev(&q->limits, bvec, offset))
+		if (bvec_gap_to_prev(&q->limits, bv, offset))
 			return 0;
 	}
 
-	if (bio_full(bio, len))
-		return 0;
-
-	if (bio->bi_vcnt >= queue_max_segments(q))
-		return 0;
-
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
 	bio->bi_vcnt++;
 	bio->bi_iter.bi_size += len;
@@ -1129,11 +1093,21 @@ int bio_add_page(struct bio *bio, struct page *page,
 {
 	bool same_page = false;
 
-	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-		if (bio_full(bio, len))
-			return 0;
-		__bio_add_page(bio, page, len, offset);
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return 0;
+	if (bio->bi_iter.bi_size > UINT_MAX - len)
+		return 0;
+
+	if (bio->bi_vcnt > 0 &&
+	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+				page, len, offset, &same_page)) {
+		bio->bi_iter.bi_size += len;
+		return len;
 	}
+
+	if (bio->bi_vcnt >= bio->bi_max_vecs)
+		return 0;
+	__bio_add_page(bio, page, len, offset);
 	return len;
 }
 EXPORT_SYMBOL(bio_add_page);
@@ -1207,13 +1181,18 @@ static int bio_iov_add_page(struct bio *bio, struct page *page,
 {
 	bool same_page = false;
 
-	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-		__bio_add_page(bio, page, len, offset);
+	if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
+		return -EIO;
+
+	if (bio->bi_vcnt > 0 &&
+	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+				page, len, offset, &same_page)) {
+		bio->bi_iter.bi_size += len;
+		if (same_page)
+			bio_release_page(bio, page);
 		return 0;
 	}
 
-	if (same_page)
-		bio_release_page(bio, page);
+	__bio_add_page(bio, page, len, offset);
 	return 0;
 }
 
@@ -1252,7 +1231,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	struct page **pages = (struct page **)bv;
 	ssize_t size, left;
 	unsigned len, i = 0;
-	size_t offset, trim;
+	size_t offset;
 	int ret = 0;
 
 	/*
@@ -1281,10 +1260,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
 
-	trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
-	iov_iter_revert(iter, trim);
+	if (bio->bi_bdev) {
+		size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
 
-	size -= trim;
+		iov_iter_revert(iter, trim);
+		size -= trim;
+	}
+
 	if (unlikely(!size)) {
 		ret = -EFAULT;
 		goto out;
@@ -1337,6 +1318,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
 	int ret = 0;
 
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return -EIO;
+
 	if (iov_iter_is_bvec(iter)) {
 		bio_iov_bvec_set(bio, iter);
 		iov_iter_advance(iter, bio->bi_iter.bi_size);
@@ -1490,6 +1474,7 @@ void bio_set_pages_dirty(struct bio *bio)
 		set_page_dirty_lock(bvec->bv_page);
 	}
 }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1549,6 +1534,7 @@ defer:
 	spin_unlock_irqrestore(&bio_dirty_lock, flags);
 	schedule_work(&bio_dirty_work);
 }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 static inline bool bio_remaining_done(struct bio *bio)
 {

View File

@@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status)
 		return "<null>";
 	return blk_errors[idx].name;
 }
+EXPORT_SYMBOL_GPL(blk_status_to_str);
 
 /**
  * blk_sync_queue - cancel any pending callbacks on a queue

View File

@@ -183,13 +183,13 @@ static void blk_flush_complete_seq(struct request *rq,
 		/* queue for flush */
 		if (list_empty(pending))
 			fq->flush_pending_since = jiffies;
-		list_move_tail(&rq->flush.list, pending);
+		list_move_tail(&rq->queuelist, pending);
 		break;
 
 	case REQ_FSEQ_DATA:
-		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+		fq->flush_data_in_flight++;
 		spin_lock(&q->requeue_lock);
-		list_add(&rq->queuelist, &q->requeue_list);
+		list_move(&rq->queuelist, &q->requeue_list);
 		spin_unlock(&q->requeue_lock);
 		blk_mq_kick_requeue_list(q);
 		break;
@@ -201,7 +201,7 @@
 		 * flush data request completion path.  Restore @rq for
 		 * normal completion and end it.
 		 */
-		list_del_init(&rq->flush.list);
+		list_del_init(&rq->queuelist);
 		blk_flush_restore_request(rq);
 		blk_mq_end_request(rq, error);
 		break;
@@ -257,7 +257,7 @@ static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
 	fq->flush_running_idx ^= 1;
 
 	/* and push the waiting requests to the next stage */
-	list_for_each_entry_safe(rq, n, running, flush.list) {
+	list_for_each_entry_safe(rq, n, running, queuelist) {
 		unsigned int seq = blk_flush_cur_seq(rq);
 
 		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
@@ -291,7 +291,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 {
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	struct request *first_rq =
-		list_first_entry(pending, struct request, flush.list);
+		list_first_entry(pending, struct request, queuelist);
 	struct request *flush_rq = fq->flush_rq;
 
 	/* C1 described at the top of this file */
@@ -299,7 +299,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 		return;
 
 	/* C2 and C3 */
-	if (!list_empty(&fq->flush_data_in_flight) &&
+	if (fq->flush_data_in_flight &&
 	    time_before(jiffies,
 			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
 		return;
@@ -374,6 +374,12 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
 	 * the comment in flush_end_io().
 	 */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
+	fq->flush_data_in_flight--;
+	/*
+	 * May have been corrupted by rq->rq_next reuse, we need to
+	 * re-initialize rq->queuelist before reusing it here.
+	 */
+	INIT_LIST_HEAD(&rq->queuelist);
 	blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
 	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 
@@ -384,7 +390,6 @@
 static void blk_rq_init_flush(struct request *rq)
 {
 	rq->flush.seq = 0;
-	INIT_LIST_HEAD(&rq->flush.list);
 	rq->rq_flags |= RQF_FLUSH_SEQ;
 	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 	rq->end_io = mq_flush_data_end_io;
@@ -443,9 +448,9 @@ bool blk_insert_flush(struct request *rq)
 		 * the post flush, and then just pass the command on.
 		 */
 		blk_rq_init_flush(rq);
-		rq->flush.seq |= REQ_FSEQ_POSTFLUSH;
+		rq->flush.seq |= REQ_FSEQ_PREFLUSH;
 		spin_lock_irq(&fq->mq_flush_lock);
-		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
+		fq->flush_data_in_flight++;
 		spin_unlock_irq(&fq->mq_flush_lock);
 		return false;
 	default:
@@ -496,7 +501,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
 
 	INIT_LIST_HEAD(&fq->flush_queue[0]);
 	INIT_LIST_HEAD(&fq->flush_queue[1]);
-	INIT_LIST_HEAD(&fq->flush_data_in_flight);
 
 	return fq;

View File

@@ -824,29 +824,6 @@ static void iolatency_clear_scaling(struct blkcg_gq *blkg)
 	}
 }
 
-static int blk_iolatency_try_init(struct blkg_conf_ctx *ctx)
-{
-	static DEFINE_MUTEX(init_mutex);
-	int ret;
-
-	ret = blkg_conf_open_bdev(ctx);
-	if (ret)
-		return ret;
-
-	/*
-	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
-	 * confuse iolat_rq_qos() test. Make the test and init atomic.
-	 */
-	mutex_lock(&init_mutex);
-	if (!iolat_rq_qos(ctx->bdev->bd_queue))
-		ret = blk_iolatency_init(ctx->bdev->bd_disk);
-	mutex_unlock(&init_mutex);
-
-	return ret;
-}
-
 static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 			     size_t nbytes, loff_t off)
 {
@@ -861,7 +838,17 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	blkg_conf_init(&ctx, buf);
 
-	ret = blk_iolatency_try_init(&ctx);
+	ret = blkg_conf_open_bdev(&ctx);
+	if (ret)
+		goto out;
+
+	/*
+	 * blk_iolatency_init() may fail after rq_qos_add() succeeds which can
+	 * confuse iolat_rq_qos() test. Make the test and init atomic.
+	 */
+	lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex);
+	if (!iolat_rq_qos(ctx.bdev->bd_queue))
+		ret = blk_iolatency_init(ctx.bdev->bd_disk);
+
 	if (ret)
 		goto out;

View File

@@ -43,6 +43,7 @@
 #include "blk-ioprio.h"
 
 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
+static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
 
 static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
 static void blk_mq_request_bypass_insert(struct request *rq,
@@ -1174,15 +1175,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
 
 static void blk_mq_complete_send_ipi(struct request *rq)
 {
-	struct llist_head *list;
 	unsigned int cpu;
 
 	cpu = rq->mq_ctx->cpu;
-	list = &per_cpu(blk_cpu_done, cpu);
-	if (llist_add(&rq->ipi_list, list)) {
-		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
-		smp_call_function_single_async(cpu, &rq->csd);
-	}
+	if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
+		smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
 }
 
 static void blk_mq_raise_softirq(struct request *rq)
@@ -1343,7 +1340,7 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
 	}
 
 	blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
-	blk_mq_run_hw_queue(hctx, false);
+	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
@@ -2242,6 +2239,8 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	 */
 	WARN_ON_ONCE(!async && in_interrupt());
 
+	might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
+
 	/*
 	 * When queue is quiesced, we may be switching io scheduler, or
 	 * updating nr_hw_queues, or other things, and we can't run queue
@@ -2257,8 +2256,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	if (!need_run)
 		return;
 
-	if (async || (hctx->flags & BLK_MQ_F_BLOCKING) ||
-	    !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
+	if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
 		blk_mq_delay_run_hw_queue(hctx, 0);
 		return;
 	}
@@ -2393,7 +2391,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 
-	blk_mq_run_hw_queue(hctx, false);
+	blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);
@@ -2423,7 +2421,8 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 	unsigned long i;
 
 	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_start_stopped_hw_queue(hctx, async);
+		blk_mq_start_stopped_hw_queue(hctx, async ||
+					(hctx->flags & BLK_MQ_F_BLOCKING));
 }
 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
@@ -2481,6 +2480,8 @@ static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
 	list_for_each_entry(rq, list, queuelist) {
 		BUG_ON(rq->mq_ctx != ctx);
 		trace_block_rq_insert(rq);
+		if (rq->cmd_flags & REQ_NOWAIT)
+			run_queue_async = true;
 	}
 
 	spin_lock(&ctx->lock);
@@ -2641,7 +2642,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
 		blk_mq_insert_request(rq, 0);
-		blk_mq_run_hw_queue(hctx, false);
+		blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
 		return;
 	}
@@ -4853,6 +4854,9 @@ static int __init blk_mq_init(void)
 	for_each_possible_cpu(i)
 		init_llist_head(&per_cpu(blk_cpu_done, i));
+	for_each_possible_cpu(i)
+		INIT_CSD(&per_cpu(blk_cpu_csd, i),
+			 __blk_mq_complete_request_remote, NULL);
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
 
 	cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,

View File

@@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth);
  */
 void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 {
-	if (wc)
+	if (wc) {
+		blk_queue_flag_set(QUEUE_FLAG_HW_WC, q);
 		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
+	} else {
+		blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q);
 		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+	}
 	if (fua)
 		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
 	else

View File

@@ -449,21 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page)
 static ssize_t queue_wc_store(struct request_queue *q, const char *page,
 			      size_t count)
 {
-	int set = -1;
-
-	if (!strncmp(page, "write back", 10))
-		set = 1;
-	else if (!strncmp(page, "write through", 13) ||
-		 !strncmp(page, "none", 4))
-		set = 0;
-
-	if (set == -1)
-		return -EINVAL;
-
-	if (set)
+	if (!strncmp(page, "write back", 10)) {
+		if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags))
+			return -EINVAL;
 		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
+	} else if (!strncmp(page, "write through", 13) ||
+		   !strncmp(page, "none", 4)) {
 		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+	} else {
+		return -EINVAL;
+	}
 
 	return count;
 }

View File

@@ -15,15 +15,14 @@ struct elevator_type;
 extern struct dentry *blk_debugfs_root;
 
 struct blk_flush_queue {
+	spinlock_t		mq_flush_lock;
 	unsigned int		flush_pending_idx:1;
 	unsigned int		flush_running_idx:1;
 	blk_status_t 		rq_status;
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
-	struct list_head	flush_data_in_flight;
+	unsigned long		flush_data_in_flight;
 	struct request		*flush_rq;
-
-	spinlock_t		mq_flush_lock;
 };
 
 bool is_flush_rq(struct request *req);
@@ -76,6 +75,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
 		gfp_t gfp_mask);
 void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
 
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+		struct page *page, unsigned len, unsigned offset,
+		bool *same_page);
+
 static inline bool biovec_phys_mergeable(struct request_queue *q,
 		struct bio_vec *vec1, struct bio_vec *vec2)
 {
@@ -251,7 +254,6 @@ static inline void bio_integrity_free(struct bio *bio)
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
-const char *blk_status_to_str(blk_status_t status);
 
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 		unsigned int nr_segs);

View File

@@ -15,6 +15,7 @@
 #include <linux/falloc.h>
 #include <linux/suspend.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/module.h>
 #include "blk.h"
 
@@ -23,15 +24,6 @@ static inline struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
-static int blkdev_get_block(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	set_buffer_mapped(bh);
-	return 0;
-}
-
 static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
 {
 	blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
@@ -387,6 +379,37 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
 }
 
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+		unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+	struct block_device *bdev = I_BDEV(inode);
+	loff_t isize = i_size_read(inode);
+
+	iomap->bdev = bdev;
+	iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+	if (iomap->offset >= isize)
+		return -EIO;
+	iomap->type = IOMAP_MAPPED;
+	iomap->addr = iomap->offset;
+	iomap->length = isize - iomap->offset;
+	iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
+	return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+	.iomap_begin		= blkdev_iomap_begin,
+};
+
+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh, int create)
+{
+	bh->b_bdev = I_BDEV(inode);
+	bh->b_blocknr = iblock;
+	set_buffer_mapped(bh);
+	return 0;
+}
+
 static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 {
 	return block_write_full_page(page, blkdev_get_block, wbc);
@@ -429,10 +452,58 @@ const struct address_space_operations def_blk_aops = {
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
-	.direct_IO	= blkdev_direct_IO,
 	.migrate_folio	= buffer_migrate_folio_norefs,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+	return iomap_read_folio(folio, &blkdev_iomap_ops);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+	iomap_readahead(rac, &blkdev_iomap_ops);
+}
+
+static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
+		struct inode *inode, loff_t offset)
+{
+	loff_t isize = i_size_read(inode);
+
+	if (WARN_ON_ONCE(offset >= isize))
+		return -EIO;
+	if (offset >= wpc->iomap.offset &&
+	    offset < wpc->iomap.offset + wpc->iomap.length)
+		return 0;
+	return blkdev_iomap_begin(inode, offset, isize - offset,
+				  IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+	.map_blocks		= blkdev_map_blocks,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	struct iomap_writepage_ctx wpc = { };
+
+	return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+}
+
+const struct address_space_operations def_blk_aops = {
+	.dirty_folio	= filemap_dirty_folio,
+	.release_folio		= iomap_release_folio,
+	.invalidate_folio	= iomap_invalidate_folio,
+	.read_folio		= blkdev_read_folio,
+	.readahead		= blkdev_readahead,
+	.writepages		= blkdev_writepages,
+	.is_partially_uptodate  = iomap_is_partially_uptodate,
+	.error_remove_page	= generic_error_remove_page,
+	.migrate_folio	= filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */
 
 /*
  * for a block special file file_inode(file)->i_size is zero
@@ -506,7 +577,7 @@ static int blkdev_open(struct inode *inode, struct file *filp)
 	 * during an unstable branch.
 	 */
 	filp->f_flags |= O_LARGEFILE;
-	filp->f_mode |= FMODE_BUF_RASYNC;
+	filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
 
 	/*
 	 * Use the file private data to store the holder for exclusive openes.
@@ -534,6 +605,35 @@ static int blkdev_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+static ssize_t
+blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	size_t count = iov_iter_count(from);
+	ssize_t written;
+
+	written = kiocb_invalidate_pages(iocb, count);
+	if (written) {
+		if (written == -EBUSY)
+			return 0;
+		return written;
+	}
+
+	written = blkdev_direct_IO(iocb, from);
+	if (written > 0) {
+		kiocb_invalidate_post_direct_write(iocb, count);
+		iocb->ki_pos += written;
+		count -= written;
+	}
+	if (written != -EIOCBQUEUED)
+		iov_iter_revert(from, count - iov_iter_count(from));
+	return written;
+}
+
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
 /*
  * Write data to the block device. Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -543,7 +643,8 @@ static int blkdev_release(struct inode *inode, struct file *filp)
  */
 static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct inode *bd_inode = bdev->bd_inode;
 	loff_t size = bdev_nr_bytes(bdev);
 	size_t shorted = 0;
@@ -570,7 +671,23 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		iov_iter_truncate(from, size);
 	}
 
-	ret = __generic_file_write_iter(iocb, from);
+	ret = file_remove_privs(file);
+	if (ret)
+		return ret;
+
+	ret = file_update_time(file);
+	if (ret)
+		return ret;
+
+	if (iocb->ki_flags & IOCB_DIRECT) {
+		ret = blkdev_direct_write(iocb, from);
+		if (ret >= 0 && iov_iter_count(from))
+			ret = direct_write_fallback(iocb, from, ret,
+					blkdev_buffered_write(iocb, from));
+	} else {
+		ret = blkdev_buffered_write(iocb, from);
+	}
+
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
 	iov_iter_reexpand(from, iov_iter_count(from) + shorted);

View File

@@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
 	struct request_queue *q = hctx->queue;
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct blk_mq_tags *tags = hctx->sched_tags;
+	unsigned int shift = tags->bitmap_tags.sb.shift;
 
-	dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
+	dd->async_depth = max(1U, 3 * (1U << shift) / 4);
 
 	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
 }

View File

@@ -2336,6 +2336,7 @@ static struct genl_family nbd_genl_family __ro_after_init = {
 	.mcgrps		= nbd_mcast_grps,
 	.n_mcgrps	= ARRAY_SIZE(nbd_mcast_grps),
 };
+MODULE_ALIAS_GENL_FAMILY(NBD_GENL_FAMILY_NAME);
 
 static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
 {

View File

@@ -1277,7 +1277,7 @@ static struct macio_driver swim3_driver =
 };
 
-int swim3_init(void)
+static int swim3_init(void)
 {
 	macio_register_driver(&swim3_driver);
 	return 0;

View File

@@ -56,16 +56,21 @@
 		| UBLK_F_USER_RECOVERY_REISSUE \
 		| UBLK_F_UNPRIVILEGED_DEV \
 		| UBLK_F_CMD_IOCTL_ENCODE \
-		| UBLK_F_USER_COPY)
+		| UBLK_F_USER_COPY \
+		| UBLK_F_ZONED)
 
 /* All UBLK_PARAM_TYPE_* should be included here */
-#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | \
-		UBLK_PARAM_TYPE_DISCARD | UBLK_PARAM_TYPE_DEVT)
+#define UBLK_PARAM_TYPE_ALL                                \
+	(UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD | \
+	 UBLK_PARAM_TYPE_DEVT | UBLK_PARAM_TYPE_ZONED)
 
 struct ublk_rq_data {
 	struct llist_node node;
 
 	struct kref ref;
+	__u64 sector;
+	__u32 operation;
+	__u32 nr_zones;
 };
 
 struct ublk_uring_cmd_pdu {
@@ -185,6 +190,263 @@ struct ublk_params_header {
 	__u32	types;
 };
 
+static inline unsigned int ublk_req_build_flags(struct request *req);
+static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
+						   int tag);
+
+static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub)
+{
+	return ub->dev_info.flags & UBLK_F_USER_COPY;
+}
+
+static inline bool ublk_dev_is_zoned(const struct ublk_device *ub)
+{
+	return ub->dev_info.flags & UBLK_F_ZONED;
+}
+
+static inline bool ublk_queue_is_zoned(struct ublk_queue *ubq)
+{
+	return ubq->flags & UBLK_F_ZONED;
+}
+
+#ifdef CONFIG_BLK_DEV_ZONED
+
+static int ublk_get_nr_zones(const struct ublk_device *ub)
+{
+	const struct ublk_param_basic *p = &ub->params.basic;
+
+	/* Zone size is a power of 2 */
+	return p->dev_sectors >> ilog2(p->chunk_sectors);
+}
+
+static int ublk_revalidate_disk_zones(struct ublk_device *ub)
+{
+	return blk_revalidate_disk_zones(ub->ub_disk, NULL);
+}
+
+static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+	const struct ublk_param_zoned *p = &ub->params.zoned;
+	int nr_zones;
+
+	if (!ublk_dev_is_zoned(ub))
+		return -EINVAL;
+
+	if (!p->max_zone_append_sectors)
+		return -EINVAL;
+
+	nr_zones = ublk_get_nr_zones(ub);
+
+	if (p->max_active_zones > nr_zones)
+		return -EINVAL;
+
+	if (p->max_open_zones > nr_zones)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+	const struct ublk_param_zoned *p = &ub->params.zoned;
+
+	disk_set_zoned(ub->ub_disk, BLK_ZONED_HM);
+	blk_queue_required_elevator_features(ub->ub_disk->queue,
+					     ELEVATOR_F_ZBD_SEQ_WRITE);
+	disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
+	disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
+	blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
+
+	ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
+
+	return 0;
+}
+
+/* Based on virtblk_alloc_report_buffer */
+static void *ublk_alloc_report_buffer(struct ublk_device *ublk,
+				      unsigned int nr_zones, size_t *buflen)
+{
+	struct request_queue *q = ublk->ub_disk->queue;
+	size_t bufsize;
+	void *buf;
+
+	nr_zones = min_t(unsigned int, nr_zones,
+			 ublk->ub_disk->nr_zones);
+
+	bufsize = nr_zones * sizeof(struct blk_zone);
+	bufsize =
+		min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT);
+
+	while (bufsize >= sizeof(struct blk_zone)) {
+		buf = kvmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
+		if (buf) {
+			*buflen = bufsize;
+			return buf;
+		}
+		bufsize >>= 1;
+	}
+
+	*buflen = 0;
+	return NULL;
+}
+
+static int ublk_report_zones(struct gendisk *disk, sector_t sector,
+		      unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct ublk_device *ub = disk->private_data;
+	unsigned int zone_size_sectors = disk->queue->limits.chunk_sectors;
+	unsigned int first_zone = sector >> ilog2(zone_size_sectors);
+	unsigned int done_zones = 0;
+	unsigned int max_zones_per_request;
+	int ret;
+	struct blk_zone *buffer;
+	size_t buffer_length;
+
+	nr_zones = min_t(unsigned int, ub->ub_disk->nr_zones - first_zone,
+			 nr_zones);
+
+	buffer = ublk_alloc_report_buffer(ub, nr_zones, &buffer_length);
+	if (!buffer)
+		return -ENOMEM;
+
+	max_zones_per_request = buffer_length / sizeof(struct blk_zone);
+
+	while (done_zones < nr_zones) {
+		unsigned int remaining_zones = nr_zones - done_zones;
+		unsigned int zones_in_request =
+			min_t(unsigned int, remaining_zones, max_zones_per_request);
+		struct request *req;
+		struct ublk_rq_data *pdu;
+		blk_status_t status;
+
+		memset(buffer, 0, buffer_length);
+
+		req = blk_mq_alloc_request(disk->queue, REQ_OP_DRV_IN, 0);
+		if (IS_ERR(req)) {
+			ret = PTR_ERR(req);
+			goto out;
+		}
+
+		pdu = blk_mq_rq_to_pdu(req);
+		pdu->operation = UBLK_IO_OP_REPORT_ZONES;
+		pdu->sector = sector;
+		pdu->nr_zones = zones_in_request;
+
+		ret = blk_rq_map_kern(disk->queue, req, buffer, buffer_length,
+					GFP_KERNEL);
+		if (ret) {
+			blk_mq_free_request(req);
+			goto out;
+		}
+
+		status = blk_execute_rq(req, 0);
+		ret = blk_status_to_errno(status);
+		blk_mq_free_request(req);
+		if (ret)
+			goto out;
+
+		for (unsigned int i = 0; i < zones_in_request; i++) {
+			struct blk_zone *zone = buffer + i;
+
+			/* A zero length zone means no more zones in this response */
+			if (!zone->len)
+				break;
+
+			ret = cb(zone, i, data);
+			if (ret)
+				goto out;
+
+			done_zones++;
+			sector += zone_size_sectors;
+
+		}
+	}
+
+	ret = done_zones;
+
+out:
+	kvfree(buffer);
+	return ret;
+}
+
+static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
+					 struct request *req)
+{
+	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+	struct ublk_io *io = &ubq->ios[req->tag];
+	struct ublk_rq_data *pdu = blk_mq_rq_to_pdu(req);
+	u32 ublk_op;
+
+	switch (req_op(req)) {
+	case REQ_OP_ZONE_OPEN:
+		ublk_op = UBLK_IO_OP_ZONE_OPEN;
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ublk_op = UBLK_IO_OP_ZONE_CLOSE;
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ublk_op = UBLK_IO_OP_ZONE_FINISH;
+		break;
+	case REQ_OP_ZONE_RESET:
+		ublk_op = UBLK_IO_OP_ZONE_RESET;
+		break;
+	case REQ_OP_ZONE_APPEND:
+		ublk_op = UBLK_IO_OP_ZONE_APPEND;
+		break;
+	case REQ_OP_DRV_IN:
+		ublk_op = pdu->operation;
+		switch (ublk_op) {
+		case UBLK_IO_OP_REPORT_ZONES:
+			iod->op_flags = ublk_op | ublk_req_build_flags(req);
+			iod->nr_zones = pdu->nr_zones;
+			iod->start_sector = pdu->sector;
+			return BLK_STS_OK;
+		default:
+			return BLK_STS_IOERR;
+		}
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_DRV_OUT:
+		/* We do not support reset_all and drv_out */
+		return BLK_STS_NOTSUPP;
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	iod->op_flags = ublk_op | ublk_req_build_flags(req);
+	iod->nr_sectors = blk_rq_sectors(req);
+	iod->start_sector = blk_rq_pos(req);
+	iod->addr = io->addr;
+
+	return BLK_STS_OK;
+}
+
+#else
+
+#define ublk_report_zones (NULL)
+
+static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+{
+	return -EOPNOTSUPP;
+}
+
+static int ublk_revalidate_disk_zones(struct ublk_device *ub)
+{
+	return 0;
+}
+
+static blk_status_t ublk_setup_iod_zoned(struct ublk_queue *ubq,
+					 struct request *req)
+{
+	return BLK_STS_NOTSUPP;
+}
+
+#endif
+
 static inline void __ublk_complete_rq(struct request *req);
 static void ublk_complete_rq(struct kref *ref);
@@ -281,6 +543,9 @@ static int ublk_validate_params(const struct ublk_device *ub)
 		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
 			return -EINVAL;
+
+		if (ublk_dev_is_zoned(ub) && !p->chunk_sectors)
+			return -EINVAL;
 	} else
 		return -EINVAL;
 
@@ -299,6 +564,11 @@ static int ublk_validate_params(const struct ublk_device *ub)
 	if (ub->params.types & UBLK_PARAM_TYPE_DEVT)
 		return -EINVAL;
 
+	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
+		return ublk_dev_param_zoned_validate(ub);
+	else if (ublk_dev_is_zoned(ub))
+		return -EINVAL;
+
 	return 0;
 }
 
@@ -312,6 +582,9 @@ static int ublk_apply_params(struct ublk_device *ub)
 	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
 		ublk_dev_param_discard_apply(ub);
 
+	if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
+		return ublk_dev_param_zoned_apply(ub);
+
 	return 0;
 }
 
@@ -482,6 +755,7 @@ static const struct block_device_operations ub_fops = {
 	.owner =	THIS_MODULE,
 	.open =		ublk_open,
 	.free_disk =	ublk_free_disk,
+	.report_zones =	ublk_report_zones,
 };
 
 #define UBLK_MAX_PIN_PAGES	32
@@ -596,7 +870,8 @@ static inline bool ublk_need_map_req(const struct request *req)
 
 static inline bool ublk_need_unmap_req(const struct request *req)
 {
-	return ublk_rq_has_data(req) && req_op(req) == REQ_OP_READ;
+	return ublk_rq_has_data(req) &&
+	       (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN);
 }
 
 static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
@@ -680,8 +955,13 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 {
 	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
 	struct ublk_io *io = &ubq->ios[req->tag];
+	enum req_op op = req_op(req);
 	u32 ublk_op;
 
+	if (!ublk_queue_is_zoned(ubq) &&
+	    (op_is_zone_mgmt(op) || op == REQ_OP_ZONE_APPEND))
+		return BLK_STS_IOERR;
+
 	switch (req_op(req)) {
 	case REQ_OP_READ:
 		ublk_op = UBLK_IO_OP_READ;
@@ -699,6 +979,8 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
 		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
 		break;
 	default:
+		if (ublk_queue_is_zoned(ubq))
+			return ublk_setup_iod_zoned(ubq, req);
 		return BLK_STS_IOERR;
 	}
 
@@ -751,7 +1033,8 @@ static inline void __ublk_complete_rq(struct request *req)
 	 *
 	 * Both the two needn't unmap.
 	 */
-	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE)
+	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE &&
+	    req_op(req) != REQ_OP_DRV_IN)
 		goto exit;
 
 	/* for READ request, writing data in iod->addr to rq buffers */
@@ -1114,8 +1397,13 @@ static void ublk_commit_completion(struct ublk_device *ub,
 
 	/* find the io request and complete */
 	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
+	if (WARN_ON_ONCE(unlikely(!req)))
+		return;
 
-	if (req && likely(!blk_should_fake_timeout(req->q)))
+	if (req_op(req) == REQ_OP_ZONE_APPEND)
+		req->__sector = ub_cmd->zone_append_lba;
+
+	if (likely(!blk_should_fake_timeout(req->q)))
 		ublk_put_req_ref(ubq, req);
 }
 
@@ -1414,11 +1702,6 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 	      ^ (_IOC_NR(cmd_op) == UBLK_IO_NEED_GET_DATA))
 		goto out;
 
-	if (ublk_support_user_copy(ubq) && ub_cmd->addr) {
-		ret = -EINVAL;
-		goto out;
-	}
-
 	ret = ublk_check_cmd_op(cmd_op);
 	if (ret)
 		goto out;
@@ -1445,6 +1728,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 		 */
 		if (!ub_cmd->addr && !ublk_need_get_data(ubq))
 			goto out;
+	} else if (ub_cmd->addr) {
+		/* User copy requires addr to be unset */
+		ret = -EINVAL;
+		goto out;
 	}
 
 	ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
@@ -1464,7 +1751,15 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
 			if (!ub_cmd->addr && (!ublk_need_get_data(ubq) ||
 						req_op(req) == REQ_OP_READ))
 				goto out;
+		} else if (req_op(req) != REQ_OP_ZONE_APPEND && ub_cmd->addr) {
+			/*
+			 * User copy requires addr to be unset when command is
+			 * not zone append
+			 */
+			ret = -EINVAL;
+			goto out;
 		}
+
 		ublk_fill_io_cmd(io, cmd, ub_cmd->addr);
 		ublk_commit_completion(ub, ub_cmd);
 		break;
@@ -1537,11 +1832,14 @@ static inline bool ublk_check_ubuf_dir(const struct request *req,
 		int ubuf_dir)
 {
 	/* copy ubuf to request pages */
-	if (req_op(req) == REQ_OP_READ && ubuf_dir == ITER_SOURCE)
+	if ((req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_DRV_IN) &&
+	    ubuf_dir == ITER_SOURCE)
 		return true;
 
 	/* copy request pages to ubuf */
-	if (req_op(req) == REQ_OP_WRITE && ubuf_dir == ITER_DEST)
+	if ((req_op(req) == REQ_OP_WRITE ||
+	     req_op(req) == REQ_OP_ZONE_APPEND) &&
+	    ubuf_dir == ITER_DEST)
 		return true;
 
 	return false;
@@ -1881,17 +2179,24 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
 	get_device(&ub->cdev_dev);
 	ub->dev_info.state = UBLK_S_DEV_LIVE;
+
+	if (ublk_dev_is_zoned(ub)) {
+		ret = ublk_revalidate_disk_zones(ub);
+		if (ret)
+			goto out_put_cdev;
+	}
+
 	ret = add_disk(disk);
+	if (ret)
+		goto out_put_cdev;
+
+	set_bit(UB_STATE_USED, &ub->state);
+
+out_put_cdev:
 	if (ret) {
-		/*
-		 * Has to drop the reference since ->free_disk won't be
-		 * called in case of add_disk failure.
-		 */
 		ub->dev_info.state = UBLK_S_DEV_DEAD;
 		ublk_put_device(ub);
-		goto out_put_disk;
 	}
-
-	set_bit(UB_STATE_USED, &ub->state);
 out_put_disk:
 	if (ret)
 		put_disk(disk);
@@ -2038,9 +2343,16 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 		UBLK_F_URING_CMD_COMP_IN_TASK;
 
 	/* GET_DATA isn't needed any more with USER_COPY */
-	if (ub->dev_info.flags & UBLK_F_USER_COPY)
+	if (ublk_dev_is_user_copy(ub))
 		ub->dev_info.flags &= ~UBLK_F_NEED_GET_DATA;
 
+	/* Zoned storage support requires user copy feature */
+	if (ublk_dev_is_zoned(ub) &&
+	    (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || !ublk_dev_is_user_copy(ub))) {
+		ret = -EINVAL;
+		goto out_free_dev_number;
+	}
+
 	/* We are not ready to support zero copy */
 	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
 
@@ -2433,14 +2745,9 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
 	if (header->len < header->dev_path_len)
 		return -EINVAL;
 
-	dev_path = kmalloc(header->dev_path_len + 1, GFP_KERNEL);
-	if (!dev_path)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(dev_path, argp, header->dev_path_len))
-		goto exit;
-	dev_path[header->dev_path_len] = 0;
+	dev_path = memdup_user_nul(argp, header->dev_path_len);
+	if (IS_ERR(dev_path))
+		return PTR_ERR(dev_path);
 
 	ret = -EINVAL;
 	switch (_IOC_NR(cmd->cmd_op)) {

View File

@@ -15,6 +15,7 @@ if MD
 config BLK_DEV_MD
 	tristate "RAID support"
 	select BLOCK_HOLDER_DEPRECATED if SYSFS
+	select BUFFER_HEAD
 	# BLOCK_LEGACY_AUTOLOAD requirement should be removed
 	# after relevant mdadm enhancements - to make "names=yes"
 	# the default - are widely available.
@@ -50,6 +51,16 @@ config MD_AUTODETECT
 
 	  If unsure, say Y.
 
+config MD_BITMAP_FILE
+	bool "MD bitmap file support (deprecated)"
+	default y
+	help
+	  If you say Y here, support for write intent bitmaps in files on an
+	  external file system is enabled.  This is an alternative to the internal
+	  bitmaps near the MD superblock, and very problematic code that abuses
+	  various kernel APIs and can only work with files on a file system not
+	  actually sitting on the MD device.
+
 config MD_LINEAR
 	tristate "Linear (append) mode (deprecated)"
 	depends on BLK_DEV_MD

View File

@@ -1160,7 +1160,6 @@ static int dm_crypt_integrity_io_alloc(struct dm_crypt_io *io, struct bio *bio)
 	tag_len = io->cc->on_disk_tag_size * (bio_sectors(bio) >> io->cc->sector_shift);
 
-	bip->bip_iter.bi_size = tag_len;
 	bip->bip_iter.bi_sector = io->cc->start + io->sector;
 
 	ret = bio_integrity_add_page(bio, virt_to_page(io->integrity_metadata),

View File

@@ -3723,7 +3723,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
 	if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			md_unregister_thread(&mddev->sync_thread);
 			md_reap_sync_thread(mddev);
 		}
 	} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)

View File

@@ -139,29 +139,26 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page
  */
 
 /* IO operations when bitmap is stored near all superblocks */
-static int read_sb_page(struct mddev *mddev, loff_t offset,
-			struct page *page,
-			unsigned long index, int size)
-{
-	/* choose a good rdev and read the page from there */
+
+/* choose a good rdev and read the page from there */
+static int read_sb_page(struct mddev *mddev, loff_t offset,
+		struct page *page, unsigned long index, int size)
+{
+	sector_t sector = mddev->bitmap_info.offset + offset +
+		index * (PAGE_SIZE / SECTOR_SIZE);
 	struct md_rdev *rdev;
-	sector_t target;
 
 	rdev_for_each(rdev, mddev) {
-		if (! test_bit(In_sync, &rdev->flags)
-		    || test_bit(Faulty, &rdev->flags)
-		    || test_bit(Bitmap_sync, &rdev->flags))
+		u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));
+
+		if (!test_bit(In_sync, &rdev->flags) ||
+		    test_bit(Faulty, &rdev->flags) ||
+		    test_bit(Bitmap_sync, &rdev->flags))
 			continue;
 
-		target = offset + index * (PAGE_SIZE/512);
-
-		if (sync_page_io(rdev, target,
-				 roundup(size, bdev_logical_block_size(rdev->bdev)),
-				 page, REQ_OP_READ, true)) {
-			page->index = index;
+		if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
 			return 0;
-		}
 	}
 	return -EIO;
 }
@@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
 }
 
 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
-			   struct page *page)
+			   unsigned long pg_index, struct page *page)
 {
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
 	loff_t sboff, offset = mddev->bitmap_info.offset;
-	sector_t ps, doff;
+	sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
 	unsigned int size = PAGE_SIZE;
 	unsigned int opt_size = PAGE_SIZE;
+	sector_t doff;
 
 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
-	if (page->index == store->file_pages - 1) {
+	if (pg_index == store->file_pages - 1) {
 		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
 
 		if (last_page_size == 0)
@@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			opt_size = optimal_io_size(bdev, last_page_size, size);
 	}
 
-	ps = page->index * PAGE_SIZE / SECTOR_SIZE;
 	sboff = rdev->sb_start + offset;
 	doff = rdev->data_offset;
 
@@ -279,55 +276,41 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	return 0;
 }
 
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
+			  struct page *page, bool wait)
 {
-	struct md_rdev *rdev;
 	struct mddev *mddev = bitmap->mddev;
-	int ret;
 
 	do {
-		rdev = NULL;
+		struct md_rdev *rdev = NULL;
+
 		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
-			ret = __write_sb_page(rdev, bitmap, page);
-			if (ret)
-				return ret;
+			if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
+				set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
+				return;
+			}
 		}
 	} while (wait && md_super_wait(mddev) < 0);
-
-	return 0;
 }
 
 static void md_bitmap_file_kick(struct bitmap *bitmap);
-/*
- * write out a page to a file
- */
-static void write_page(struct bitmap *bitmap, struct page *page, int wait)
+
+#ifdef CONFIG_MD_BITMAP_FILE
+static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
 {
-	struct buffer_head *bh;
+	struct buffer_head *bh = page_buffers(page);
 
-	if (bitmap->storage.file == NULL) {
-		switch (write_sb_page(bitmap, page, wait)) {
-		case -EINVAL:
-			set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
-		}
-	} else {
+	while (bh && bh->b_blocknr) {
+		atomic_inc(&bitmap->pending_writes);
+		set_buffer_locked(bh);
+		set_buffer_mapped(bh);
+		submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
+		bh = bh->b_this_page;
+	}
 
-		bh = page_buffers(page);
-		while (bh && bh->b_blocknr) {
-			atomic_inc(&bitmap->pending_writes);
-			set_buffer_locked(bh);
-			set_buffer_mapped(bh);
-			submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
-			bh = bh->b_this_page;
-		}
-
-		if (wait)
-			wait_event(bitmap->write_wait,
-				   atomic_read(&bitmap->pending_writes)==0);
-	}
-	if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
-		md_bitmap_file_kick(bitmap);
+	if (wait)
+		wait_event(bitmap->write_wait,
+			   atomic_read(&bitmap->pending_writes) == 0);
 }
 
 static void end_bitmap_write(struct buffer_head *bh, int uptodate)
@@ -364,10 +347,8 @@
  * This usage is similar to how swap files are handled, and allows us
  * to write to a file with no concerns of memory allocation failing.
  */
-static int read_page(struct file *file, unsigned long index,
-		     struct bitmap *bitmap,
-		     unsigned long count,
-		     struct page *page)
+static int read_file_page(struct file *file, unsigned long index,
+		struct bitmap *bitmap, unsigned long count, struct page *page)
 {
 	int ret = 0;
 	struct inode *inode = file_inode(file);
@@ -415,7 +396,6 @@ static int read_page(struct file *file, unsigned long index,
 		blk_cur++;
 		bh = bh->b_this_page;
 	}
-	page->index = index;
 
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes)==0);
@@ -429,11 +409,45 @@ out:
 	       ret);
 	return ret;
 }
+
+#else /* CONFIG_MD_BITMAP_FILE */
+static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+}
+static int read_file_page(struct file *file, unsigned long index,
+		struct bitmap *bitmap, unsigned long count, struct page *page)
+{
+	return -EIO;
+}
+static void free_buffers(struct page *page)
{
put_page(page);
}
#endif /* CONFIG_MD_BITMAP_FILE */
/* /*
* bitmap file superblock operations * bitmap file superblock operations
*/ */
/*
* write out a page to a file
*/
static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
bool wait)
{
struct bitmap_storage *store = &bitmap->storage;
struct page *page = store->filemap[pg_index];
if (mddev_is_clustered(bitmap->mddev)) {
pg_index += bitmap->cluster_slot *
DIV_ROUND_UP(store->bytes, PAGE_SIZE);
}
if (store->file)
write_file_page(bitmap, page, wait);
else
write_sb_page(bitmap, pg_index, page, wait);
}
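For readers following filemap_write_page() above, the only subtle part is the clustered index adjustment: every cluster slot owns DIV_ROUND_UP(store->bytes, PAGE_SIZE) bitmap pages laid out back to back. A minimal userspace sketch with made-up numbers (store_bytes and cluster_slot are assumptions, not values from this patch):

#include <stdio.h>

#define PAGE_SZ 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long store_bytes = 9000;  /* assumed store->bytes */
	unsigned long cluster_slot = 2;    /* assumed bitmap->cluster_slot */
	unsigned long pg_index = 1;        /* page within this node's bitmap */

	unsigned long pages_per_slot = DIV_ROUND_UP(store_bytes, PAGE_SZ);
	unsigned long on_disk_index = pg_index + cluster_slot * pages_per_slot;

	printf("pages per slot %lu, on-disk index %lu\n",
	       pages_per_slot, on_disk_index);  /* 3 and 7 */
	return 0;
}

So in this hypothetical setup slot 2 writes its second page at overall index 7, which is the pg_index that reaches __write_sb_page() when the bitmap lives in the superblock area rather than a file.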
/* /*
* md_bitmap_wait_writes() should be called before writing any bitmap * md_bitmap_wait_writes() should be called before writing any bitmap
* blocks, to ensure previous writes, particularly from * blocks, to ensure previous writes, particularly from
@ -488,7 +502,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space); bitmap_info.space);
kunmap_atomic(sb); kunmap_atomic(sb);
write_page(bitmap, bitmap->storage.sb_page, 1);
if (bitmap->storage.file)
write_file_page(bitmap, bitmap->storage.sb_page, 1);
else
write_sb_page(bitmap, bitmap->storage.sb_index,
bitmap->storage.sb_page, 1);
} }
EXPORT_SYMBOL(md_bitmap_update_sb); EXPORT_SYMBOL(md_bitmap_update_sb);
@ -540,7 +559,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bitmap->storage.sb_page == NULL) if (bitmap->storage.sb_page == NULL)
return -ENOMEM; return -ENOMEM;
bitmap->storage.sb_page->index = 0; bitmap->storage.sb_index = 0;
sb = kmap_atomic(bitmap->storage.sb_page); sb = kmap_atomic(bitmap->storage.sb_page);
@ -601,7 +620,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
unsigned long sectors_reserved = 0; unsigned long sectors_reserved = 0;
int err = -EINVAL; int err = -EINVAL;
struct page *sb_page; struct page *sb_page;
loff_t offset = bitmap->mddev->bitmap_info.offset; loff_t offset = 0;
if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
chunksize = 128 * 1024 * 1024; chunksize = 128 * 1024 * 1024;
@ -628,7 +647,7 @@ re_read:
bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
/* to 4k blocks */ /* to 4k blocks */
bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); offset = bitmap->cluster_slot * (bm_blocks << 3);
pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
bitmap->cluster_slot, offset); bitmap->cluster_slot, offset);
} }
@ -637,13 +656,11 @@ re_read:
loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
err = read_page(bitmap->storage.file, 0, err = read_file_page(bitmap->storage.file, 0,
bitmap, bytes, sb_page); bitmap, bytes, sb_page);
} else { } else {
err = read_sb_page(bitmap->mddev, err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
offset, sizeof(bitmap_super_t));
sb_page,
0, sizeof(bitmap_super_t));
} }
if (err) if (err)
return err; return err;
@ -819,7 +836,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
if (store->sb_page) { if (store->sb_page) {
store->filemap[0] = store->sb_page; store->filemap[0] = store->sb_page;
pnum = 1; pnum = 1;
store->sb_page->index = offset; store->sb_index = offset;
} }
for ( ; pnum < num_pages; pnum++) { for ( ; pnum < num_pages; pnum++) {
@ -828,7 +845,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
store->file_pages = pnum; store->file_pages = pnum;
return -ENOMEM; return -ENOMEM;
} }
store->filemap[pnum]->index = pnum + offset;
} }
store->file_pages = pnum; store->file_pages = pnum;
@ -847,14 +863,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
static void md_bitmap_file_unmap(struct bitmap_storage *store) static void md_bitmap_file_unmap(struct bitmap_storage *store)
{ {
struct page **map, *sb_page; struct file *file = store->file;
int pages; struct page *sb_page = store->sb_page;
struct file *file; struct page **map = store->filemap;
int pages = store->file_pages;
file = store->file;
map = store->filemap;
pages = store->file_pages;
sb_page = store->sb_page;
while (pages--) while (pages--)
if (map[pages] != sb_page) /* 0 is sb_page, release it below */ if (map[pages] != sb_page) /* 0 is sb_page, release it below */
@ -879,21 +891,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
*/ */
static void md_bitmap_file_kick(struct bitmap *bitmap) static void md_bitmap_file_kick(struct bitmap *bitmap)
{ {
char *path, *ptr = NULL;
if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
md_bitmap_update_sb(bitmap); md_bitmap_update_sb(bitmap);
if (bitmap->storage.file) { if (bitmap->storage.file) {
path = kmalloc(PAGE_SIZE, GFP_KERNEL); pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
if (path) bmname(bitmap), bitmap->storage.file);
ptr = file_path(bitmap->storage.file,
path, PAGE_SIZE);
pr_warn("%s: kicking failed bitmap file %s from array!\n",
bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
kfree(path);
} else } else
pr_warn("%s: disabling internal bitmap due to errors\n", pr_warn("%s: disabling internal bitmap due to errors\n",
bmname(bitmap)); bmname(bitmap));
@ -945,6 +949,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
void *kaddr; void *kaddr;
unsigned long chunk = block >> bitmap->counts.chunkshift; unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage; struct bitmap_storage *store = &bitmap->storage;
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0; unsigned long node_offset = 0;
if (mddev_is_clustered(bitmap->mddev)) if (mddev_is_clustered(bitmap->mddev))
@ -962,9 +967,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
else else
set_bit_le(bit, kaddr); set_bit_le(bit, kaddr);
kunmap_atomic(kaddr); kunmap_atomic(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, page->index); pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */ /* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
} }
static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
@ -974,6 +979,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
void *paddr; void *paddr;
unsigned long chunk = block >> bitmap->counts.chunkshift; unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage; struct bitmap_storage *store = &bitmap->storage;
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0; unsigned long node_offset = 0;
if (mddev_is_clustered(bitmap->mddev)) if (mddev_is_clustered(bitmap->mddev))
@ -989,8 +995,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
else else
clear_bit_le(bit, paddr); clear_bit_le(bit, paddr);
kunmap_atomic(paddr); kunmap_atomic(paddr);
if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0; bitmap->allclean = 0;
} }
} }
@ -1042,7 +1048,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
"md bitmap_unplug"); "md bitmap_unplug");
} }
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
write_page(bitmap, bitmap->storage.filemap[i], 0); filemap_write_page(bitmap, i, false);
writing = 1; writing = 1;
} }
} }
@ -1084,33 +1090,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
EXPORT_SYMBOL(md_bitmap_unplug_async); EXPORT_SYMBOL(md_bitmap_unplug_async);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
/* * bitmap_init_from_disk -- called at bitmap_create time to initialize
* the in-memory bitmap from the on-disk bitmap -- also, sets up the /*
* memory mapping of the bitmap file * Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
* Special cases: * mapping of the bitmap file.
* if there's no bitmap file, or if the bitmap file had been *
* previously kicked from the array, we mark all the bits as * Special case: If there's no bitmap file, or if the bitmap file had been
* 1's in order to cause a full resync. * previously kicked from the array, we mark all the bits as 1's in order to
* cause a full resync.
* *
* We ignore all bits for sectors that end earlier than 'start'. * We ignore all bits for sectors that end earlier than 'start'.
* This is used when reading an out-of-date bitmap... * This is used when reading an out-of-date bitmap.
*/ */
static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{ {
unsigned long i, chunks, index, oldindex, bit, node_offset = 0; bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
struct page *page = NULL; struct mddev *mddev = bitmap->mddev;
unsigned long bit_cnt = 0; unsigned long chunks = bitmap->counts.chunks;
struct file *file;
unsigned long offset;
int outofdate;
int ret = -ENOSPC;
void *paddr;
struct bitmap_storage *store = &bitmap->storage; struct bitmap_storage *store = &bitmap->storage;
struct file *file = store->file;
unsigned long node_offset = 0;
unsigned long bit_cnt = 0;
unsigned long i;
int ret;
chunks = bitmap->counts.chunks; if (!file && !mddev->bitmap_info.offset) {
file = store->file;
if (!file && !bitmap->mddev->bitmap_info.offset) {
/* No permanent bitmap - fill with '1s'. */ /* No permanent bitmap - fill with '1s'. */
store->filemap = NULL; store->filemap = NULL;
store->file_pages = 0; store->file_pages = 0;
@ -1125,77 +1129,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
return 0; return 0;
} }
outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
if (outofdate)
pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));
if (file && i_size_read(file->f_mapping->host) < store->bytes) { if (file && i_size_read(file->f_mapping->host) < store->bytes) {
pr_warn("%s: bitmap file too short %lu < %lu\n", pr_warn("%s: bitmap file too short %lu < %lu\n",
bmname(bitmap), bmname(bitmap),
(unsigned long) i_size_read(file->f_mapping->host), (unsigned long) i_size_read(file->f_mapping->host),
store->bytes); store->bytes);
ret = -ENOSPC;
goto err; goto err;
} }
oldindex = ~0L; if (mddev_is_clustered(mddev))
offset = 0;
if (!bitmap->mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
for (i = 0; i < chunks; i++) { for (i = 0; i < store->file_pages; i++) {
int b; struct page *page = store->filemap[i];
index = file_page_index(&bitmap->storage, i); int count;
bit = file_page_offset(&bitmap->storage, i);
if (index != oldindex) { /* this is a new page, read it in */
int count;
/* unmap the old page, we're done with it */
if (index == store->file_pages-1)
count = store->bytes - index * PAGE_SIZE;
else
count = PAGE_SIZE;
page = store->filemap[index];
if (file)
ret = read_page(file, index, bitmap,
count, page);
else
ret = read_sb_page(
bitmap->mddev,
bitmap->mddev->bitmap_info.offset,
page,
index + node_offset, count);
if (ret) /* unmap the old page, we're done with it */
goto err; if (i == store->file_pages - 1)
count = store->bytes - i * PAGE_SIZE;
else
count = PAGE_SIZE;
oldindex = index; if (file)
ret = read_file_page(file, i, bitmap, count, page);
else
ret = read_sb_page(mddev, 0, page, i + node_offset,
count);
if (ret)
goto err;
}
if (outofdate) { if (outofdate) {
/* pr_warn("%s: bitmap file is out of date, doing full recovery\n",
* if bitmap is out of date, dirty the bmname(bitmap));
* whole page and write it out
*/
paddr = kmap_atomic(page);
memset(paddr + offset, 0xff,
PAGE_SIZE - offset);
kunmap_atomic(paddr);
write_page(bitmap, page, 1);
for (i = 0; i < store->file_pages; i++) {
struct page *page = store->filemap[i];
unsigned long offset = 0;
void *paddr;
if (i == 0 && !mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
/*
* If the bitmap is out of date, dirty the whole page
* and write it out
*/
paddr = kmap_atomic(page);
memset(paddr + offset, 0xff, PAGE_SIZE - offset);
kunmap_atomic(paddr);
filemap_write_page(bitmap, i, true);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
ret = -EIO; ret = -EIO;
if (test_bit(BITMAP_WRITE_ERROR, goto err;
&bitmap->flags))
goto err;
} }
} }
}
for (i = 0; i < chunks; i++) {
struct page *page = filemap_get_page(&bitmap->storage, i);
unsigned long bit = file_page_offset(&bitmap->storage, i);
void *paddr;
bool was_set;
paddr = kmap_atomic(page); paddr = kmap_atomic(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
b = test_bit(bit, paddr); was_set = test_bit(bit, paddr);
else else
b = test_bit_le(bit, paddr); was_set = test_bit_le(bit, paddr);
kunmap_atomic(paddr); kunmap_atomic(paddr);
if (b) {
if (was_set) {
/* if the disk bit is set, set the memory bit */ /* if the disk bit is set, set the memory bit */
int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
>= start); >= start);
@ -1204,7 +1210,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
needed); needed);
bit_cnt++; bit_cnt++;
} }
offset = 0;
} }
pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
@ -1396,9 +1401,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
break; break;
if (bitmap->storage.filemap && if (bitmap->storage.filemap &&
test_and_clear_page_attr(bitmap, j, test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) { BITMAP_PAGE_NEEDWRITE))
write_page(bitmap, bitmap->storage.filemap[j], 0); filemap_write_page(bitmap, j, false);
}
} }
done: done:
@ -2542,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX) if (backlog > COUNTER_MAX)
return -EINVAL; return -EINVAL;
rv = mddev_lock(mddev);
if (rv)
return rv;
/* /*
* Without write mostly device, it doesn't make sense to set * Without write mostly device, it doesn't make sense to set
* backlog for max_write_behind. * backlog for max_write_behind.
@ -2555,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!has_write_mostly) { if (!has_write_mostly) {
pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n", pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
mdname(mddev)); mdname(mddev));
mddev_unlock(mddev);
return -EINVAL; return -EINVAL;
} }
@ -2565,13 +2574,13 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev_destroy_serial_pool(mddev, NULL, false); mddev_destroy_serial_pool(mddev, NULL, false);
} else if (backlog && !mddev->serial_info_pool) { } else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */ /* serial_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
rdev_for_each(rdev, mddev) rdev_for_each(rdev, mddev)
mddev_create_serial_pool(mddev, rdev, false); mddev_create_serial_pool(mddev, rdev, false);
} }
if (old_mwb != backlog) if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap); md_bitmap_update_sb(mddev->bitmap);
mddev_unlock(mddev);
return len; return len;
} }

View File

@ -201,6 +201,7 @@ struct bitmap {
struct file *file; /* backing disk file */ struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap struct page *sb_page; /* cached copy of the bitmap
* file superblock */ * file superblock */
unsigned long sb_index;
struct page **filemap; /* list of cache pages for struct page **filemap; /* list of cache pages for
* the file */ * the file */
unsigned long *filemap_attr; /* attributes associated unsigned long *filemap_attr; /* attributes associated

View File

@ -952,8 +952,8 @@ static int join(struct mddev *mddev, int nodes)
return 0; return 0;
err: err:
set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(mddev, &cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread); md_unregister_thread(mddev, &cinfo->recv_thread);
lockres_free(cinfo->message_lockres); lockres_free(cinfo->message_lockres);
lockres_free(cinfo->token_lockres); lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres); lockres_free(cinfo->ack_lockres);
@ -1015,8 +1015,8 @@ static int leave(struct mddev *mddev)
resync_bitmap(mddev); resync_bitmap(mddev);
set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state);
md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(mddev, &cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread); md_unregister_thread(mddev, &cinfo->recv_thread);
lockres_free(cinfo->message_lockres); lockres_free(cinfo->message_lockres);
lockres_free(cinfo->token_lockres); lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres); lockres_free(cinfo->ack_lockres);

View File

@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
failit = 1; failit = 1;
} }
} }
md_account_bio(mddev, &bio);
if (failit) { if (failit) {
struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO, struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
&mddev->bio_set); &mddev->bio_set);

View File

@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio = split; bio = split;
} }
md_account_bio(mddev, &bio);
bio_set_dev(bio, tmp_dev->rdev->bdev); bio_set_dev(bio, tmp_dev->rdev->bdev);
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
start_sector + data_offset; start_sector + data_offset;

View File

@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
&& md_flush_request(mddev, bio)) && md_flush_request(mddev, bio))
return true; return true;
md_account_bio(mddev, &bio);
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO); mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio; mp_bh->master_bio = bio;

View File

@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev)
mddev->pers->prepare_suspend(mddev); mddev->pers->prepare_suspend(mddev);
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
mddev->pers->quiesce(mddev, 1);
clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
@ -465,14 +464,15 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev) void mddev_resume(struct mddev *mddev)
{ {
/* entered the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
lockdep_assert_held(&mddev->reconfig_mutex); lockdep_assert_held(&mddev->reconfig_mutex);
if (--mddev->suspended) if (--mddev->suspended)
return; return;
/* entered the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
percpu_ref_resurrect(&mddev->active_io); percpu_ref_resurrect(&mddev->active_io);
wake_up(&mddev->sb_wait); wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev)
{ {
mutex_init(&mddev->open_mutex); mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
mutex_init(&mddev->bitmap_info.mutex); mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks); INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs); INIT_LIST_HEAD(&mddev->all_mddevs);
@ -650,6 +651,7 @@ void mddev_init(struct mddev *mddev)
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0); timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1); atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0); atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock); spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0); atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->sb_wait);
@ -2304,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev)
pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
(mddev->level != 1 && mddev->level != 10 && (mddev->level != 1 && mddev->level != 10 &&
bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
/* /*
* No need to handle the failure of bioset_integrity_create, * No need to handle the failure of bioset_integrity_create,
* because the function is called by md_run() -> pers->run(), * because the function is called by md_run() -> pers->run(),
@ -4747,6 +4749,62 @@ action_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", type); return sprintf(page, "%s\n", type);
} }
static void stop_sync_thread(struct mddev *mddev)
{
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return;
if (mddev_lock(mddev))
return;
/*
* Check again in case MD_RECOVERY_RUNNING is cleared before lock is
* held.
*/
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
mddev_unlock(mddev);
return;
}
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
/*
* Thread might be blocked waiting for metadata update which will now
* never happen
*/
md_wakeup_thread_directly(mddev->sync_thread);
mddev_unlock(mddev);
}
static void idle_sync_thread(struct mddev *mddev)
{
int sync_seq = atomic_read(&mddev->sync_seq);
mutex_lock(&mddev->sync_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev);
wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
mutex_unlock(&mddev->sync_mutex);
}
static void frozen_sync_thread(struct mddev *mddev)
{
mutex_lock(&mddev->sync_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev);
wait_event(resync_wait, mddev->sync_thread == NULL &&
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
mutex_unlock(&mddev->sync_mutex);
}
static ssize_t static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len) action_store(struct mddev *mddev, const char *page, size_t len)
{ {
@ -4754,35 +4812,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL; return -EINVAL;
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { if (cmd_match(page, "idle"))
if (cmd_match(page, "frozen")) idle_sync_thread(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); else if (cmd_match(page, "frozen"))
else frozen_sync_thread(mddev);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
mddev_lock(mddev) == 0) {
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
sector_t save_rp = mddev->reshape_position;
mddev_unlock(mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
mddev_lock_nointr(mddev);
/*
* set RECOVERY_INTR again and restore reshape
* position in case others changed them after
* got lock, eg, reshape_position_store and
* md_check_recovery.
*/
mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
mddev_unlock(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY; return -EBUSY;
else if (cmd_match(page, "resync")) else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
@ -5842,6 +5876,13 @@ int md_run(struct mddev *mddev)
goto exit_bio_set; goto exit_bio_set;
} }
if (!bioset_initialized(&mddev->io_clone_set)) {
err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
offsetof(struct md_io_clone, bio_clone), 0);
if (err)
goto exit_sync_set;
}
spin_lock(&pers_lock); spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel); pers = find_pers(mddev->level, mddev->clevel);
if (!pers || !try_module_get(pers->owner)) { if (!pers || !try_module_get(pers->owner)) {
@ -6019,6 +6060,8 @@ bitmap_abort:
module_put(pers->owner); module_put(pers->owner);
md_bitmap_destroy(mddev); md_bitmap_destroy(mddev);
abort: abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
bioset_exit(&mddev->sync_set); bioset_exit(&mddev->sync_set);
exit_bio_set: exit_bio_set:
bioset_exit(&mddev->bio_set); bioset_exit(&mddev->bio_set);
@ -6176,7 +6219,6 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq); flush_workqueue(md_misc_wq);
if (mddev->sync_thread) { if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev); md_reap_sync_thread(mddev);
} }
@ -6216,7 +6258,7 @@ static void mddev_detach(struct mddev *mddev)
mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0); mddev->pers->quiesce(mddev, 0);
} }
md_unregister_thread(&mddev->thread); md_unregister_thread(mddev, &mddev->thread);
if (mddev->queue) if (mddev->queue)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
} }
@ -6243,6 +6285,7 @@ static void __md_stop(struct mddev *mddev)
percpu_ref_exit(&mddev->active_io); percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set); bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set); bioset_exit(&mddev->sync_set);
bioset_exit(&mddev->io_clone_set);
} }
void md_stop(struct mddev *mddev) void md_stop(struct mddev *mddev)
@ -7012,6 +7055,15 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (mddev->bitmap || mddev->bitmap_info.file) if (mddev->bitmap || mddev->bitmap_info.file)
return -EEXIST; /* cannot add when bitmap is present */ return -EEXIST; /* cannot add when bitmap is present */
if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
pr_warn("%s: bitmap files not supported by this kernel\n",
mdname(mddev));
return -EINVAL;
}
pr_warn("%s: using deprecated bitmap file support\n",
mdname(mddev));
f = fget(fd); f = fget(fd);
if (f == NULL) { if (f == NULL) {
@ -7940,9 +7992,10 @@ struct md_thread *md_register_thread(void (*run) (struct md_thread *),
} }
EXPORT_SYMBOL(md_register_thread); EXPORT_SYMBOL(md_register_thread);
void md_unregister_thread(struct md_thread __rcu **threadp) void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp)
{ {
struct md_thread *thread = rcu_dereference_protected(*threadp, true); struct md_thread *thread = rcu_dereference_protected(*threadp,
lockdep_is_held(&mddev->reconfig_mutex));
if (!thread) if (!thread)
return; return;
@ -8601,63 +8654,45 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
} }
EXPORT_SYMBOL_GPL(md_submit_discard_bio); EXPORT_SYMBOL_GPL(md_submit_discard_bio);
int acct_bioset_init(struct mddev *mddev) static void md_end_clone_io(struct bio *bio)
{ {
int err = 0; struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
if (!bioset_initialized(&mddev->io_acct_set)) struct mddev *mddev = md_io_clone->mddev;
err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
offsetof(struct md_io_acct, bio_clone), 0);
return err;
}
EXPORT_SYMBOL_GPL(acct_bioset_init);
void acct_bioset_exit(struct mddev *mddev)
{
bioset_exit(&mddev->io_acct_set);
}
EXPORT_SYMBOL_GPL(acct_bioset_exit);
static void md_end_io_acct(struct bio *bio)
{
struct md_io_acct *md_io_acct = bio->bi_private;
struct bio *orig_bio = md_io_acct->orig_bio;
struct mddev *mddev = md_io_acct->mddev;
orig_bio->bi_status = bio->bi_status; orig_bio->bi_status = bio->bi_status;
bio_end_io_acct(orig_bio, md_io_acct->start_time); if (md_io_clone->start_time)
bio_end_io_acct(orig_bio, md_io_clone->start_time);
bio_put(bio); bio_put(bio);
bio_endio(orig_bio); bio_endio(orig_bio);
percpu_ref_put(&mddev->active_io); percpu_ref_put(&mddev->active_io);
} }
/* static void md_clone_bio(struct mddev *mddev, struct bio **bio)
* Used by personalities that don't already clone the bio and thus can't
* easily add the timestamp to their extended bio structure.
*/
void md_account_bio(struct mddev *mddev, struct bio **bio)
{ {
struct block_device *bdev = (*bio)->bi_bdev; struct block_device *bdev = (*bio)->bi_bdev;
struct md_io_acct *md_io_acct; struct md_io_clone *md_io_clone;
struct bio *clone; struct bio *clone =
bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
if (!blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
return; md_io_clone->orig_bio = *bio;
md_io_clone->mddev = mddev;
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
percpu_ref_get(&mddev->active_io); clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
md_io_acct->orig_bio = *bio;
md_io_acct->start_time = bio_start_io_acct(*bio);
md_io_acct->mddev = mddev;
clone->bi_end_io = md_end_io_acct;
clone->bi_private = md_io_acct;
*bio = clone; *bio = clone;
} }
void md_account_bio(struct mddev *mddev, struct bio **bio)
{
percpu_ref_get(&mddev->active_io);
md_clone_bio(mddev, bio);
}
EXPORT_SYMBOL_GPL(md_account_bio); EXPORT_SYMBOL_GPL(md_account_bio);
/* md_allow_write(mddev) /* md_allow_write(mddev)
@ -9329,7 +9364,6 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk * ->spare_active and clear saved_raid_disk
*/ */
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev); md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@ -9358,17 +9392,24 @@ void md_check_recovery(struct mddev *mddev)
if (mddev->sb_flags) if (mddev->sb_flags)
md_update_sb(mddev, 0); md_update_sb(mddev, 0);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && /*
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { * Never start a new sync thread if MD_RECOVERY_RUNNING is
/* resync/recovery still happening */ * still set.
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); */
goto unlock; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
} if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
if (mddev->sync_thread) { /* resync/recovery still happening */
md_unregister_thread(&mddev->sync_thread); clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
if (WARN_ON_ONCE(!mddev->sync_thread))
goto unlock;
md_reap_sync_thread(mddev); md_reap_sync_thread(mddev);
goto unlock; goto unlock;
} }
/* Set RUNNING before clearing NEEDED to avoid /* Set RUNNING before clearing NEEDED to avoid
* any transients in the value of "sync_action". * any transients in the value of "sync_action".
*/ */
@ -9445,7 +9486,10 @@ void md_reap_sync_thread(struct mddev *mddev)
sector_t old_dev_sectors = mddev->dev_sectors; sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false; bool is_reshaped = false;
/* sync_thread should be unregistered, collect result */ /* resync has finished, collect result */
md_unregister_thread(mddev, &mddev->sync_thread);
atomic_inc(&mddev->sync_seq);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) { mddev->degraded != mddev->raid_disks) {
@ -9490,7 +9534,6 @@ void md_reap_sync_thread(struct mddev *mddev)
if (mddev_is_clustered(mddev) && is_reshaped if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags)) && !test_bit(MD_CLOSING, &mddev->flags))
md_cluster_ops->update_size(mddev, old_dev_sectors); md_cluster_ops->update_size(mddev, old_dev_sectors);
wake_up(&resync_wait);
/* flag recovery needed just to double check */ /* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed); sysfs_notify_dirent_safe(mddev->sysfs_completed);
@ -9498,6 +9541,7 @@ void md_reap_sync_thread(struct mddev *mddev)
md_new_event(); md_new_event();
if (mddev->event_work.func) if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work); queue_work(md_misc_wq, &mddev->event_work);
wake_up(&resync_wait);
} }
EXPORT_SYMBOL(md_reap_sync_thread); EXPORT_SYMBOL(md_reap_sync_thread);

View File

@ -510,7 +510,7 @@ struct mddev {
struct bio_set sync_set; /* for sync operations like struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes * metadata and bitmap writes
*/ */
struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ struct bio_set io_clone_set;
/* Generic flush handling. /* Generic flush handling.
* The last to finish preflush schedules a worker to submit * The last to finish preflush schedules a worker to submit
@ -535,6 +535,11 @@ struct mddev {
*/ */
struct list_head deleting; struct list_head deleting;
/* Used to synchronize idle and frozen for action_store() */
struct mutex sync_mutex;
/* The sequence number for sync thread */
atomic_t sync_seq;
bool has_superblocks:1; bool has_superblocks:1;
bool fail_last_dev:1; bool fail_last_dev:1;
bool serialize_policy:1; bool serialize_policy:1;
@ -731,7 +736,7 @@ struct md_thread {
void *private; void *private;
}; };
struct md_io_acct { struct md_io_clone {
struct mddev *mddev; struct mddev *mddev;
struct bio *orig_bio; struct bio *orig_bio;
unsigned long start_time; unsigned long start_time;
@ -756,7 +761,7 @@ extern struct md_thread *md_register_thread(
void (*run)(struct md_thread *thread), void (*run)(struct md_thread *thread),
struct mddev *mddev, struct mddev *mddev,
const char *name); const char *name);
extern void md_unregister_thread(struct md_thread __rcu **threadp); extern void md_unregister_thread(struct mddev *mddev, struct md_thread __rcu **threadp);
extern void md_wakeup_thread(struct md_thread __rcu *thread); extern void md_wakeup_thread(struct md_thread __rcu *thread);
extern void md_check_recovery(struct mddev *mddev); extern void md_check_recovery(struct mddev *mddev);
extern void md_reap_sync_thread(struct mddev *mddev); extern void md_reap_sync_thread(struct mddev *mddev);
@ -769,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev); extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size); struct bio *bio, sector_t start, sector_t size);
int acct_bioset_init(struct mddev *mddev);
void acct_bioset_exit(struct mddev *mddev);
void md_account_bio(struct mddev *mddev, struct bio **bio); void md_account_bio(struct mddev *mddev, struct bio **bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);

View File

@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
struct r0conf *conf = priv; struct r0conf *conf = priv;
free_conf(mddev, conf); free_conf(mddev, conf);
acct_bioset_exit(mddev);
} }
static int raid0_run(struct mddev *mddev) static int raid0_run(struct mddev *mddev)
@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev)) if (md_check_no_bitmap(mddev))
return -EINVAL; return -EINVAL;
if (acct_bioset_init(mddev)) {
pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
return -ENOMEM;
}
/* if private is not null, we are here after takeover */ /* if private is not null, we are here after takeover */
if (mddev->private == NULL) { if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf); ret = create_strip_zones(mddev, &conf);
if (ret < 0) if (ret < 0)
goto exit_acct_set; return ret;
mddev->private = conf; mddev->private = conf;
} }
conf = mddev->private; conf = mddev->private;
@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev); ret = md_integrity_register(mddev);
if (ret) if (ret)
goto free; free_conf(mddev, conf);
return ret; return ret;
free:
free_conf(mddev, conf);
exit_acct_set:
acct_bioset_exit(mddev);
return ret;
} }
/* /*

View File

@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR; bio->bi_status = BLK_STS_IOERR;
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
bio_end_io_acct(bio, r1_bio->start_time);
bio_endio(bio); bio_endio(bio);
} }
@ -791,11 +789,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk; return best_disk;
} }
static void wake_up_barrier(struct r1conf *conf)
{
if (wq_has_sleeper(&conf->wait_barrier))
wake_up(&conf->wait_barrier);
}
static void flush_bio_list(struct r1conf *conf, struct bio *bio) static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{ {
/* flush any pending bitmap writes to disk before proceeding w/ I/O */ /* flush any pending bitmap writes to disk before proceeding w/ I/O */
raid1_prepare_flush_writes(conf->mddev->bitmap); raid1_prepare_flush_writes(conf->mddev->bitmap);
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
while (bio) { /* submit pending writes */ while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next; struct bio *next = bio->bi_next;
@ -972,7 +976,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
* In case freeze_array() is waiting for * In case freeze_array() is waiting for
* get_unqueued_pending() == extra * get_unqueued_pending() == extra
*/ */
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
/* Wait for the barrier in same barrier unit bucket to drop. */ /* Wait for the barrier in same barrier unit bucket to drop. */
/* Return false when nowait flag is set */ /* Return false when nowait flag is set */
@ -1015,7 +1019,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa
* In case freeze_array() is waiting for * In case freeze_array() is waiting for
* get_unqueued_pending() == extra * get_unqueued_pending() == extra
*/ */
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
/* Wait for array to be unfrozen */ /* Wait for array to be unfrozen */
/* Return false when nowait flag is set */ /* Return false when nowait flag is set */
@ -1044,7 +1048,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
static void _allow_barrier(struct r1conf *conf, int idx) static void _allow_barrier(struct r1conf *conf, int idx)
{ {
atomic_dec(&conf->nr_pending[idx]); atomic_dec(&conf->nr_pending[idx]);
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
} }
static void allow_barrier(struct r1conf *conf, sector_t sector_nr) static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
@ -1173,7 +1177,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending); bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
kfree(plug); kfree(plug);
return; return;
@ -1303,10 +1307,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
} }
r1_bio->read_disk = rdisk; r1_bio->read_disk = rdisk;
if (!r1bio_existed) {
if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) md_account_bio(mddev, &bio);
r1_bio->start_time = bio_start_io_acct(bio); r1_bio->master_bio = bio;
}
read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp, read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
&mddev->bio_set); &mddev->bio_set);
@ -1500,8 +1504,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sectors = max_sectors; r1_bio->sectors = max_sectors;
} }
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) md_account_bio(mddev, &bio);
r1_bio->start_time = bio_start_io_acct(bio); r1_bio->master_bio = bio;
atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0); atomic_set(&r1_bio->behind_remaining, 0);
@ -1576,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio_write_done(r1_bio); r1_bio_write_done(r1_bio);
/* In case raid1d snuck in to freeze_array */ /* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
} }
static bool raid1_make_request(struct mddev *mddev, struct bio *bio) static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
@ -1766,7 +1770,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
int err = -EEXIST; int err = -EEXIST;
int mirror = 0; int mirror = 0, repl_slot = -1;
struct raid1_info *p; struct raid1_info *p;
int first = 0; int first = 0;
int last = conf->raid_disks - 1; int last = conf->raid_disks - 1;
@ -1809,17 +1813,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break; break;
} }
if (test_bit(WantReplacement, &p->rdev->flags) && if (test_bit(WantReplacement, &p->rdev->flags) &&
p[conf->raid_disks].rdev == NULL) { p[conf->raid_disks].rdev == NULL && repl_slot < 0)
/* Add this device as a replacement */ repl_slot = mirror;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
break;
}
} }
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf); print_conf(conf);
return err; return err;
} }
@ -1829,6 +1837,10 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct r1conf *conf = mddev->private; struct r1conf *conf = mddev->private;
int err = 0; int err = 0;
int number = rdev->raid_disk; int number = rdev->raid_disk;
if (unlikely(number >= conf->raid_disks))
goto abort;
struct raid1_info *p = conf->mirrors + number; struct raid1_info *p = conf->mirrors + number;
if (rdev != p->rdev) if (rdev != p->rdev)
@ -2299,7 +2311,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d++; d++;
if (d == conf->raid_disks * 2) if (d == conf->raid_disks * 2)
d = 0; d = 0;
} while (!success && d != read_disk); } while (d != read_disk);
if (!success) { if (!success) {
/* Cannot read from anywhere - mark it bad */ /* Cannot read from anywhere - mark it bad */
@ -3144,7 +3156,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active * RAID1 needs at least one disk in active
*/ */
if (conf->raid_disks - mddev->degraded < 1) { if (conf->raid_disks - mddev->degraded < 1) {
md_unregister_thread(&conf->thread); md_unregister_thread(mddev, &conf->thread);
ret = -EINVAL; ret = -EINVAL;
goto abort; goto abort;
} }
@ -3171,7 +3183,7 @@ static int raid1_run(struct mddev *mddev)
ret = md_integrity_register(mddev); ret = md_integrity_register(mddev);
if (ret) { if (ret) {
md_unregister_thread(&mddev->thread); md_unregister_thread(mddev, &mddev->thread);
goto abort; goto abort;
} }
return 0; return 0;

View File

@ -157,7 +157,6 @@ struct r1bio {
sector_t sector; sector_t sector;
int sectors; int sectors;
unsigned long state; unsigned long state;
unsigned long start_time;
struct mddev *mddev; struct mddev *mddev;
/* /*
* original bio going to /dev/mdx * original bio going to /dev/mdx

View File

@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR; bio->bi_status = BLK_STS_IOERR;
if (r10_bio->start_time)
bio_end_io_acct(bio, r10_bio->start_time);
bio_endio(bio); bio_endio(bio);
/* /*
* Wake up any possible resync thread that waits for the device * Wake up any possible resync thread that waits for the device
@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
} }
static void raid10_read_request(struct mddev *mddev, struct bio *bio, static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio) struct r10bio *r10_bio, bool io_accounting)
{ {
struct r10conf *conf = mddev->private; struct r10conf *conf = mddev->private;
struct bio *read_bio; struct bio *read_bio;
@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
} }
slot = r10_bio->read_slot; slot = r10_bio->read_slot;
if (!r10_bio->start_time && if (io_accounting) {
blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) md_account_bio(mddev, &bio);
r10_bio->start_time = bio_start_io_acct(bio); r10_bio->master_bio = bio;
}
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set); read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio; r10_bio->devs[slot].bio = read_bio;
@ -1322,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
} }
} }
static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
struct md_rdev **prrdev)
{
struct md_rdev *rdev, *rrdev;
rrdev = rcu_dereference(mirror->replacement);
/*
* Read replacement first to prevent reading both rdev and
* replacement as NULL while the replacement is replacing rdev.
*/
smp_mb();
rdev = rcu_dereference(mirror->rdev);
if (rdev == rrdev)
rrdev = NULL;
*prrdev = rrdev;
return rdev;
}
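A short aside on the smp_mb() in dereference_rdev_and_rrdev() above: it only helps if the writer publishes the pointers in the opposite order, installing the replacement as ->rdev before clearing ->replacement. That writer-side sequence is not part of this hunk, so the runnable userspace analogue below is a hedged sketch of the assumed pairing; rdev and replacement are plain atomic pointers standing in for the RCU-protected fields.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(int *) rdev;         /* stands in for mirror->rdev */
static _Atomic(int *) replacement;  /* stands in for mirror->replacement */
static int new_disk;

/* assumed writer side: promote the replacement, then clear it */
static void *promote_replacement(void *arg)
{
	(void)arg;
	atomic_store_explicit(&rdev, &new_disk, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);  /* the writer's smp_mb() */
	atomic_store_explicit(&replacement, NULL, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	atomic_store(&rdev, NULL);              /* old member already gone */
	atomic_store(&replacement, &new_disk);  /* replacement still present */

	pthread_t t;
	pthread_create(&t, NULL, promote_replacement, NULL);

	/* reader, mirroring dereference_rdev_and_rrdev(): replacement first */
	int *rr = atomic_load_explicit(&replacement, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);  /* the reader's smp_mb() */
	int *rd = atomic_load_explicit(&rdev, memory_order_relaxed);

	printf("rdev %p, replacement %p\n", (void *)rd, (void *)rr);
	pthread_join(t, NULL);
	return 0;
}

Whichever writer store the reader races with, at least one of the two loads returns a non-NULL pointer, which is exactly the guarantee the comment in the new helper relies on.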
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{ {
int i; int i;
@ -1332,11 +1350,9 @@ retry_wait:
blocked_rdev = NULL; blocked_rdev = NULL;
rcu_read_lock(); rcu_read_lock();
for (i = 0; i < conf->copies; i++) { for (i = 0; i < conf->copies; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); struct md_rdev *rdev, *rrdev;
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[i].replacement); rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
if (rdev == rrdev)
rrdev = NULL;
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev; blocked_rdev = rdev;
@ -1465,15 +1481,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int d = r10_bio->devs[i].devnum; int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev, *rrdev; struct md_rdev *rdev, *rrdev;
rrdev = rcu_dereference(conf->mirrors[d].replacement); rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
/*
* Read replacement first to prevent reading both rdev and
* replacement as NULL while the replacement is replacing rdev.
*/
smp_mb();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == rrdev)
rrdev = NULL;
if (rdev && (test_bit(Faulty, &rdev->flags))) if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL; rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags))) if (rrdev && (test_bit(Faulty, &rrdev->flags)))
@ -1543,8 +1551,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->master_bio = bio; r10_bio->master_bio = bio;
} }
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) md_account_bio(mddev, &bio);
r10_bio->start_time = bio_start_io_acct(bio); r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1); atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
@ -1571,12 +1579,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0; r10_bio->state = 0;
r10_bio->read_slot = -1; r10_bio->read_slot = -1;
r10_bio->start_time = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks); conf->geo.raid_disks);
if (bio_data_dir(bio) == READ) if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio); raid10_read_request(mddev, bio, r10_bio, true);
else else
raid10_write_request(mddev, bio, r10_bio); raid10_write_request(mddev, bio, r10_bio);
} }
@ -1780,10 +1787,9 @@ retry_discard:
*/ */
rcu_read_lock(); rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) { for (disk = 0; disk < geo->raid_disks; disk++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); struct md_rdev *rdev, *rrdev;
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[disk].replacement);
rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
r10_bio->devs[disk].bio = NULL; r10_bio->devs[disk].bio = NULL;
r10_bio->devs[disk].repl_bio = NULL; r10_bio->devs[disk].repl_bio = NULL;
@ -2720,10 +2726,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{ {
int sect = 0; /* Offset from r10_bio->sector */ int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors; int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
struct md_rdev *rdev; struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors); int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum; int d = r10_bio->devs[slot].devnum;
/* still own a reference to this rdev, so it cannot /* still own a reference to this rdev, so it cannot
* have been cleared recently. * have been cleared recently.
@ -2744,13 +2750,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
pr_notice("md/raid10:%s: %pg: Failing raid device\n", pr_notice("md/raid10:%s: %pg: Failing raid device\n",
mdname(mddev), rdev->bdev); mdname(mddev), rdev->bdev);
md_error(mddev, rdev); md_error(mddev, rdev);
r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; r10_bio->devs[slot].bio = IO_BLOCKED;
return; return;
} }
while(sectors) { while(sectors) {
int s = sectors; int s = sectors;
int sl = r10_bio->read_slot; int sl = slot;
int success = 0; int success = 0;
int start; int start;
@ -2785,7 +2791,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
sl++; sl++;
if (sl == conf->copies) if (sl == conf->copies)
sl = 0; sl = 0;
} while (!success && sl != r10_bio->read_slot); } while (sl != slot);
rcu_read_unlock(); rcu_read_unlock();
if (!success) { if (!success) {
@ -2793,16 +2799,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
* as bad on the first device to discourage future * as bad on the first device to discourage future
* reads. * reads.
*/ */
int dn = r10_bio->devs[r10_bio->read_slot].devnum; int dn = r10_bio->devs[slot].devnum;
rdev = conf->mirrors[dn].rdev; rdev = conf->mirrors[dn].rdev;
if (!rdev_set_badblocks( if (!rdev_set_badblocks(
rdev, rdev,
r10_bio->devs[r10_bio->read_slot].addr r10_bio->devs[slot].addr
+ sect, + sect,
s, 0)) { s, 0)) {
md_error(mddev, rdev); md_error(mddev, rdev);
r10_bio->devs[r10_bio->read_slot].bio r10_bio->devs[slot].bio
= IO_BLOCKED; = IO_BLOCKED;
} }
break; break;
@ -2811,7 +2817,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
start = sl; start = sl;
/* write it back and re-read */ /* write it back and re-read */
rcu_read_lock(); rcu_read_lock();
while (sl != r10_bio->read_slot) { while (sl != slot) {
if (sl==0) if (sl==0)
sl = conf->copies; sl = conf->copies;
sl--; sl--;
@ -2845,7 +2851,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rcu_read_lock(); rcu_read_lock();
} }
sl = start; sl = start;
while (sl != r10_bio->read_slot) { while (sl != slot) {
if (sl==0) if (sl==0)
sl = conf->copies; sl = conf->copies;
sl--; sl--;
@ -2985,7 +2991,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
r10_bio->state = 0; r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio); raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
/* /*
* allow_barrier after re-submit to ensure no sync io * allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending. * can be issued while regular io pending.
@ -4314,7 +4320,7 @@ static int raid10_run(struct mddev *mddev)
return 0; return 0;
out_free_conf: out_free_conf:
md_unregister_thread(&mddev->thread); md_unregister_thread(mddev, &mddev->thread);
raid10_free_conf(conf); raid10_free_conf(conf);
mddev->private = NULL; mddev->private = NULL;
out: out:
@ -4411,7 +4417,6 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
rdev->new_raid_disk = rdev->raid_disk * 2; rdev->new_raid_disk = rdev->raid_disk * 2;
rdev->sectors = size; rdev->sectors = size;
} }
WRITE_ONCE(conf->barrier, 1);
} }
return conf; return conf;

View File

@ -123,7 +123,6 @@ struct r10bio {
sector_t sector; /* virtual sector number */ sector_t sector; /* virtual sector number */
int sectors; int sectors;
unsigned long state; unsigned long state;
unsigned long start_time;
struct mddev *mddev; struct mddev *mddev;
/* /*
* original bio going to /dev/mdx * original bio going to /dev/mdx

View File

@ -1260,14 +1260,13 @@ static void r5l_log_flush_endio(struct bio *bio)
if (bio->bi_status) if (bio->bi_status)
md_error(log->rdev->mddev, log->rdev); md_error(log->rdev->mddev, log->rdev);
bio_uninit(bio);
spin_lock_irqsave(&log->io_list_lock, flags); spin_lock_irqsave(&log->io_list_lock, flags);
list_for_each_entry(io, &log->flushing_ios, log_sibling) list_for_each_entry(io, &log->flushing_ios, log_sibling)
r5l_io_run_stripes(io); r5l_io_run_stripes(io);
list_splice_tail_init(&log->flushing_ios, &log->finished_ios); list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
spin_unlock_irqrestore(&log->io_list_lock, flags); spin_unlock_irqrestore(&log->io_list_lock, flags);
bio_uninit(bio);
} }
/* /*
@ -3168,12 +3167,15 @@ void r5l_exit_log(struct r5conf *conf)
{ {
struct r5l_log *log = conf->log; struct r5l_log *log = conf->log;
/* Ensure disable_writeback_work wakes up and exits */ md_unregister_thread(conf->mddev, &log->reclaim_thread);
/*
* 'reconfig_mutex' is held by caller, set 'conf->log' to NULL to
* ensure disable_writeback_work wakes up and exits.
*/
conf->log = NULL;
wake_up(&conf->mddev->sb_wait); wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work); flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
conf->log = NULL;
mempool_exit(&log->meta_pool); mempool_exit(&log->meta_pool);
bioset_exit(&log->bs); bioset_exit(&log->bs);

View File

@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
*/ */
static void raid5_align_endio(struct bio *bi) static void raid5_align_endio(struct bio *bi)
{ {
struct md_io_acct *md_io_acct = bi->bi_private; struct bio *raid_bi = bi->bi_private;
struct bio *raid_bi = md_io_acct->orig_bio; struct md_rdev *rdev = (void *)raid_bi->bi_next;
struct mddev *mddev; struct mddev *mddev = rdev->mddev;
struct r5conf *conf; struct r5conf *conf = mddev->private;
struct md_rdev *rdev;
blk_status_t error = bi->bi_status; blk_status_t error = bi->bi_status;
unsigned long start_time = md_io_acct->start_time;
bio_put(bi); bio_put(bi);
rdev = (void*)raid_bi->bi_next;
raid_bi->bi_next = NULL; raid_bi->bi_next = NULL;
mddev = rdev->mddev;
conf = mddev->private;
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
if (!error) { if (!error) {
if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
bio_end_io_acct(raid_bi, start_time);
bio_endio(raid_bi); bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads)) if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_quiescent);
@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t sector, end_sector, first_bad; sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx; int bad_sectors, dd_idx;
struct md_io_acct *md_io_acct;
bool did_inc; bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) { if (!in_chunk_boundary(mddev, raid_bio)) {
@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0; return 0;
} }
align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO, md_account_bio(mddev, &raid_bio);
&mddev->io_acct_set);
md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
raid_bio->bi_next = (void *)rdev; raid_bio->bi_next = (void *)rdev;
if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
md_io_acct->start_time = bio_start_io_acct(raid_bio);
md_io_acct->orig_bio = raid_bio;
align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
&mddev->bio_set);
align_bio->bi_end_io = raid5_align_endio; align_bio->bi_end_io = raid5_align_endio;
align_bio->bi_private = md_io_acct; align_bio->bi_private = raid_bio;
align_bio->bi_iter.bi_sector = sector; align_bio->bi_iter.bi_sector = sector;
/* No reshape active, so we can trust rdev->data_offset */ /* No reshape active, so we can trust rdev->data_offset */
@ -7787,19 +7774,12 @@ static int raid5_run(struct mddev *mddev)
struct md_rdev *rdev; struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL; struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0; sector_t reshape_offset = 0;
int i, ret = 0; int i;
long long min_offset_diff = 0; long long min_offset_diff = 0;
int first = 1; int first = 1;
if (acct_bioset_init(mddev)) { if (mddev_init_writes_pending(mddev) < 0)
pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
return -ENOMEM; return -ENOMEM;
}
if (mddev_init_writes_pending(mddev) < 0) {
ret = -ENOMEM;
goto exit_acct_set;
}
if (mddev->recovery_cp != MaxSector) if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@ -7830,8 +7810,7 @@ static int raid5_run(struct mddev *mddev)
(mddev->bitmap_info.offset || mddev->bitmap_info.file)) { (mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
mdname(mddev)); mdname(mddev));
ret = -EINVAL; return -EINVAL;
goto exit_acct_set;
} }
if (mddev->reshape_position != MaxSector) { if (mddev->reshape_position != MaxSector) {
@ -7856,15 +7835,13 @@ static int raid5_run(struct mddev *mddev)
if (journal_dev) { if (journal_dev) {
pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n", pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev)); mdname(mddev));
ret = -EINVAL; return -EINVAL;
goto exit_acct_set;
} }
if (mddev->new_level != mddev->level) { if (mddev->new_level != mddev->level) {
pr_warn("md/raid:%s: unsupported reshape required - aborting.\n", pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
mdname(mddev)); mdname(mddev));
ret = -EINVAL; return -EINVAL;
goto exit_acct_set;
} }
old_disks = mddev->raid_disks - mddev->delta_disks; old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one /* reshape_position must be on a new-stripe boundary, and one
@ -7880,8 +7857,7 @@ static int raid5_run(struct mddev *mddev)
if (sector_div(here_new, chunk_sectors * new_data_disks)) { if (sector_div(here_new, chunk_sectors * new_data_disks)) {
pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n", pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
mdname(mddev)); mdname(mddev));
ret = -EINVAL; return -EINVAL;
goto exit_acct_set;
} }
reshape_offset = here_new * chunk_sectors; reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */ /* here_new is the stripe we will write to */
@ -7903,8 +7879,7 @@ static int raid5_run(struct mddev *mddev)
else if (mddev->ro == 0) { else if (mddev->ro == 0) {
pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n", pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
mdname(mddev)); mdname(mddev));
ret = -EINVAL; return -EINVAL;
goto exit_acct_set;
} }
} else if (mddev->reshape_backwards } else if (mddev->reshape_backwards
? (here_new * chunk_sectors + min_offset_diff <= ? (here_new * chunk_sectors + min_offset_diff <=
@ -7914,8 +7889,7 @@ static int raid5_run(struct mddev *mddev)
/* Reading from the same stripe as writing to - bad */ /* Reading from the same stripe as writing to - bad */
pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n", pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
mdname(mddev)); mdname(mddev));
ret = -EINVAL; return -EINVAL;
goto exit_acct_set;
} }
pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev)); pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
/* OK, we should be able to continue; */ /* OK, we should be able to continue; */
@ -7939,10 +7913,8 @@ static int raid5_run(struct mddev *mddev)
else else
conf = mddev->private; conf = mddev->private;
if (IS_ERR(conf)) { if (IS_ERR(conf))
ret = PTR_ERR(conf); return PTR_ERR(conf);
goto exit_acct_set;
}
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
if (!journal_dev) { if (!journal_dev) {
@ -8135,15 +8107,12 @@ static int raid5_run(struct mddev *mddev)
return 0; return 0;
abort: abort:
md_unregister_thread(&mddev->thread); md_unregister_thread(mddev, &mddev->thread);
print_raid5_conf(conf); print_raid5_conf(conf);
free_conf(conf); free_conf(conf);
mddev->private = NULL; mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
ret = -EIO; return -EIO;
exit_acct_set:
acct_bioset_exit(mddev);
return ret;
} }
static void raid5_free(struct mddev *mddev, void *priv) static void raid5_free(struct mddev *mddev, void *priv)
@ -8151,7 +8120,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
struct r5conf *conf = priv; struct r5conf *conf = priv;
free_conf(conf); free_conf(conf);
acct_bioset_exit(mddev);
mddev->to_remove = &raid5_attrs_group; mddev->to_remove = &raid5_attrs_group;
} }

View File

@ -118,7 +118,6 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
goto out_free_meta; goto out_free_meta;
} }
bip->bip_iter.bi_size = len;
bip->bip_iter.bi_sector = seed; bip->bip_iter.bi_sector = seed;
ret = bio_integrity_add_page(bio, virt_to_page(buf), len, ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
offset_in_page(buf)); offset_in_page(buf));

View File

@ -206,12 +206,11 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
return PTR_ERR(bip); return PTR_ERR(bip);
} }
bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
/* virtual start sector must be in integrity interval units */ /* virtual start sector must be in integrity interval units */
bip_set_seed(bip, bio->bi_iter.bi_sector >> bip_set_seed(bip, bio->bi_iter.bi_sector >>
(bi->interval_exp - SECTOR_SHIFT)); (bi->interval_exp - SECTOR_SHIFT));
resid = bip->bip_iter.bi_size; resid = bio_integrity_bytes(bi, bio_sectors(bio));
while (resid > 0 && sg_miter_next(miter)) { while (resid > 0 && sg_miter_next(miter)) {
len = min_t(size_t, miter->length, resid); len = min_t(size_t, miter->length, resid);
rc = bio_integrity_add_page(bio, miter->page, len, rc = bio_integrity_add_page(bio, miter->page, len,

View File

@ -300,11 +300,6 @@ void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd)
cmd->budget_token = -1; cmd->budget_token = -1;
} }
static void scsi_kick_queue(struct request_queue *q)
{
blk_mq_run_hw_queues(q, false);
}
/* /*
* Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with * Kick the queue of SCSI device @sdev if @sdev != current_sdev. Called with
* interrupts disabled. * interrupts disabled.
@ -340,7 +335,8 @@ static void scsi_single_lun_run(struct scsi_device *current_sdev)
* but in most cases, we will be first. Ideally, each LU on the * but in most cases, we will be first. Ideally, each LU on the
* target would get some limited time or requests on the target. * target would get some limited time or requests on the target.
*/ */
scsi_kick_queue(current_sdev->request_queue); blk_mq_run_hw_queues(current_sdev->request_queue,
shost->queuecommand_may_block);
spin_lock_irqsave(shost->host_lock, flags); spin_lock_irqsave(shost->host_lock, flags);
if (!starget->starget_sdev_user) if (!starget->starget_sdev_user)
@ -427,7 +423,7 @@ static void scsi_starved_list_run(struct Scsi_Host *shost)
continue; continue;
spin_unlock_irqrestore(shost->host_lock, flags); spin_unlock_irqrestore(shost->host_lock, flags);
scsi_kick_queue(slq); blk_mq_run_hw_queues(slq, false);
blk_put_queue(slq); blk_put_queue(slq);
spin_lock_irqsave(shost->host_lock, flags); spin_lock_irqsave(shost->host_lock, flags);
@ -452,8 +448,8 @@ static void scsi_run_queue(struct request_queue *q)
if (!list_empty(&sdev->host->starved_list)) if (!list_empty(&sdev->host->starved_list))
scsi_starved_list_run(sdev->host); scsi_starved_list_run(sdev->host);
/* Note: blk_mq_kick_requeue_list() runs the queue asynchronously. */
blk_mq_kick_requeue_list(q); blk_mq_kick_requeue_list(q);
blk_mq_run_hw_queues(q, false);
} }
void scsi_requeue_run_queue(struct work_struct *work) void scsi_requeue_run_queue(struct work_struct *work)

View File

@ -689,7 +689,6 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
return PTR_ERR(bip); return PTR_ERR(bip);
} }
bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
/* virtual start sector must be in integrity interval units */ /* virtual start sector must be in integrity interval units */
bip_set_seed(bip, bio->bi_iter.bi_sector >> bip_set_seed(bip, bio->bi_iter.bi_sector >>
(bi->interval_exp - SECTOR_SHIFT)); (bi->interval_exp - SECTOR_SHIFT));
@ -697,7 +696,7 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio,
pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size, pr_debug("IBLOCK BIP Size: %u Sector: %llu\n", bip->bip_iter.bi_size,
(unsigned long long)bip->bip_iter.bi_sector); (unsigned long long)bip->bip_iter.bi_sector);
resid = bip->bip_iter.bi_size; resid = bio_integrity_bytes(bi, bio_sectors(bio));
while (resid > 0 && sg_miter_next(miter)) { while (resid > 0 && sg_miter_next(miter)) {
len = min_t(size_t, miter->length, resid); len = min_t(size_t, miter->length, resid);

View File

@ -18,8 +18,12 @@ config VALIDATE_FS_PARSER
config FS_IOMAP config FS_IOMAP
bool bool
config BUFFER_HEAD
bool
# old blockdev_direct_IO implementation. Use iomap for new code instead # old blockdev_direct_IO implementation. Use iomap for new code instead
config LEGACY_DIRECT_IO config LEGACY_DIRECT_IO
depends on BUFFER_HEAD
bool bool
if BLOCK if BLOCK

View File

@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \
fs_types.o fs_context.o fs_parser.o fsopen.o init.o \ fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
kernel_read_file.o mnt_idmapping.o remap_range.o kernel_read_file.o mnt_idmapping.o remap_range.o
obj-$(CONFIG_BLOCK) += buffer.o mpage.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o
obj-$(CONFIG_PROC_FS) += proc_namespace.o obj-$(CONFIG_PROC_FS) += proc_namespace.o
obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o obj-$(CONFIG_LEGACY_DIRECT_IO) += direct-io.o
obj-y += notify/ obj-y += notify/

View File

@ -2,6 +2,7 @@
config ADFS_FS config ADFS_FS
tristate "ADFS file system support" tristate "ADFS file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
The Acorn Disc Filing System is the standard file system of the The Acorn Disc Filing System is the standard file system of the
RiscOS operating system which runs on Acorn's ARM-based Risc PC RiscOS operating system which runs on Acorn's ARM-based Risc PC

View File

@ -2,6 +2,7 @@
config AFFS_FS config AFFS_FS
tristate "Amiga FFS file system support" tristate "Amiga FFS file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help
The Fast File System (FFS) is the common file system used on hard The Fast File System (FFS) is the common file system used on hard

View File

@ -2,6 +2,7 @@
config BEFS_FS config BEFS_FS
tristate "BeOS file system (BeFS) support (read only)" tristate "BeOS file system (BeFS) support (read only)"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select NLS select NLS
help help
The BeOS File System (BeFS) is the native file system of Be, Inc's The BeOS File System (BeFS) is the native file system of Be, Inc's

View File

@ -2,6 +2,7 @@
config BFS_FS config BFS_FS
tristate "BFS file system support" tristate "BFS file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
Boot File System (BFS) is a file system used under SCO UnixWare to Boot File System (BFS) is a file system used under SCO UnixWare to
allow the bootloader access to the kernel image and other important allow the bootloader access to the kernel image and other important

View File

@ -562,12 +562,6 @@ repeat:
return err; return err;
} }
void emergency_thaw_bdev(struct super_block *sb)
{
while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}
/** /**
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written * @mapping: the mapping which wants those buffers written

View File

@ -2,6 +2,7 @@
config EFS_FS config EFS_FS
tristate "EFS file system support (read only)" tristate "EFS file system support (read only)"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
EFS is an older file system used for non-ISO9660 CD-ROMs and hard EFS is an older file system used for non-ISO9660 CD-ROMs and hard
disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer

View File

@ -2,6 +2,7 @@
config EXFAT_FS config EXFAT_FS
tristate "exFAT filesystem support" tristate "exFAT filesystem support"
select BUFFER_HEAD
select NLS select NLS
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config EXT2_FS config EXT2_FS
tristate "Second extended fs support" tristate "Second extended fs support"
select BUFFER_HEAD
select FS_IOMAP select FS_IOMAP
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -28,6 +28,7 @@ config EXT3_FS_SECURITY
config EXT4_FS config EXT4_FS
tristate "The Extended 4 (ext4) filesystem" tristate "The Extended 4 (ext4) filesystem"
select BUFFER_HEAD
select JBD2 select JBD2
select CRC16 select CRC16
select CRYPTO select CRYPTO

View File

@ -6140,7 +6140,7 @@ retry_alloc:
if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry_alloc; goto retry_alloc;
out_ret: out_ret:
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
out: out:
filemap_invalidate_unlock_shared(mapping); filemap_invalidate_unlock_shared(mapping);
sb_end_pagefault(inode->i_sb); sb_end_pagefault(inode->i_sb);

View File

@ -2,6 +2,7 @@
config F2FS_FS config F2FS_FS
tristate "F2FS filesystem support" tristate "F2FS filesystem support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select NLS select NLS
select CRYPTO select CRYPTO
select CRYPTO_CRC32 select CRYPTO_CRC32

View File

@ -159,7 +159,7 @@ out_sem:
sb_end_pagefault(inode->i_sb); sb_end_pagefault(inode->i_sb);
err: err:
return block_page_mkwrite_return(err); return vmf_fs_error(err);
} }
static const struct vm_operations_struct f2fs_file_vm_ops = { static const struct vm_operations_struct f2fs_file_vm_ops = {

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config FAT_FS config FAT_FS
tristate tristate
select BUFFER_HEAD
select NLS select NLS
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -2,6 +2,7 @@
config VXFS_FS config VXFS_FS
tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
FreeVxFS is a file system driver that support the VERITAS VxFS(TM) FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
file system format. VERITAS VxFS(TM) is the standard file system file system format. VERITAS VxFS(TM) is the standard file system

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config GFS2_FS config GFS2_FS
tristate "GFS2 file system support" tristate "GFS2 file system support"
select BUFFER_HEAD
select FS_POSIX_ACL select FS_POSIX_ACL
select CRC32 select CRC32
select LIBCRC32C select LIBCRC32C

View File

@ -432,7 +432,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
err = gfs2_glock_nq(&gh); err = gfs2_glock_nq(&gh);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_uninit; goto out_uninit;
} }
@ -474,7 +474,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
err = gfs2_rindex_update(sdp); err = gfs2_rindex_update(sdp);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_unlock; goto out_unlock;
} }
@ -482,12 +482,12 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
ap.target = data_blocks + ind_blocks; ap.target = data_blocks + ind_blocks;
err = gfs2_quota_lock_check(ip, &ap); err = gfs2_quota_lock_check(ip, &ap);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_unlock; goto out_unlock;
} }
err = gfs2_inplace_reserve(ip, &ap); err = gfs2_inplace_reserve(ip, &ap);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_quota_unlock; goto out_quota_unlock;
} }
@ -500,7 +500,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
} }
err = gfs2_trans_begin(sdp, rblocks, 0); err = gfs2_trans_begin(sdp, rblocks, 0);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_trans_fail; goto out_trans_fail;
} }
@ -508,7 +508,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
if (gfs2_is_stuffed(ip)) { if (gfs2_is_stuffed(ip)) {
err = gfs2_unstuff_dinode(ip); err = gfs2_unstuff_dinode(ip);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_trans_end; goto out_trans_end;
} }
} }
@ -524,7 +524,7 @@ static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf)
err = gfs2_allocate_page_backing(page, length); err = gfs2_allocate_page_backing(page, length);
if (err) if (err)
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
out_page_locked: out_page_locked:
if (ret != VM_FAULT_LOCKED) if (ret != VM_FAULT_LOCKED)
@ -558,7 +558,7 @@ static vm_fault_t gfs2_fault(struct vm_fault *vmf)
gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
err = gfs2_glock_nq(&gh); err = gfs2_glock_nq(&gh);
if (err) { if (err) {
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_uninit; goto out_uninit;
} }
ret = filemap_fault(vmf); ret = filemap_fault(vmf);

View File

@ -2,6 +2,7 @@
config HFS_FS config HFS_FS
tristate "Apple Macintosh file system support" tristate "Apple Macintosh file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select NLS select NLS
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -2,6 +2,7 @@
config HFSPLUS_FS config HFSPLUS_FS
tristate "Apple Extended HFS file system support" tristate "Apple Extended HFS file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select NLS select NLS
select NLS_UTF8 select NLS_UTF8
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO

View File

@ -2,6 +2,7 @@
config HPFS_FS config HPFS_FS
tristate "OS/2 HPFS file system support" tristate "OS/2 HPFS file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select FS_IOMAP select FS_IOMAP
help help
OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS

View File

@ -23,16 +23,10 @@ struct mnt_idmap;
*/ */
#ifdef CONFIG_BLOCK #ifdef CONFIG_BLOCK
extern void __init bdev_cache_init(void); extern void __init bdev_cache_init(void);
void emergency_thaw_bdev(struct super_block *sb);
#else #else
static inline void bdev_cache_init(void) static inline void bdev_cache_init(void)
{ {
} }
static inline int emergency_thaw_bdev(struct super_block *sb)
{
return 0;
}
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BLOCK */
/* /*

View File

@ -1286,7 +1286,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
return VM_FAULT_LOCKED; return VM_FAULT_LOCKED;
out_unlock: out_unlock:
folio_unlock(folio); folio_unlock(folio);
return block_page_mkwrite_return(ret); return vmf_fs_error(ret);
} }
EXPORT_SYMBOL_GPL(iomap_page_mkwrite); EXPORT_SYMBOL_GPL(iomap_page_mkwrite);

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config ISO9660_FS config ISO9660_FS
tristate "ISO 9660 CDROM file system support" tristate "ISO 9660 CDROM file system support"
select BUFFER_HEAD
help help
This is the standard file system used on CD-ROMs. It was previously This is the standard file system used on CD-ROMs. It was previously
known as "High Sierra File System" and is called "hsfs" on other known as "High Sierra File System" and is called "hsfs" on other

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config JFS_FS config JFS_FS
tristate "JFS filesystem support" tristate "JFS filesystem support"
select BUFFER_HEAD
select NLS select NLS
select CRC32 select CRC32
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO

View File

@ -2,6 +2,7 @@
config MINIX_FS config MINIX_FS
tristate "Minix file system support" tristate "Minix file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
Minix is a simple operating system used in many classes about OS's. Minix is a simple operating system used in many classes about OS's.
The minix file system (method to organize files on a hard disk The minix file system (method to organize files on a hard disk

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config NILFS2_FS config NILFS2_FS
tristate "NILFS2 file system support" tristate "NILFS2 file system support"
select BUFFER_HEAD
select CRC32 select CRC32
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -108,7 +108,7 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
wait_for_stable_page(page); wait_for_stable_page(page);
out: out:
sb_end_pagefault(inode->i_sb); sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret); return vmf_fs_error(ret);
} }
static const struct vm_operations_struct nilfs_file_vm_ops = { static const struct vm_operations_struct nilfs_file_vm_ops = {

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config NTFS_FS config NTFS_FS
tristate "NTFS file system support" tristate "NTFS file system support"
select BUFFER_HEAD
select NLS select NLS
help help
NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config NTFS3_FS config NTFS3_FS
tristate "NTFS Read-Write file system support" tristate "NTFS Read-Write file system support"
select BUFFER_HEAD
select NLS select NLS
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -2,6 +2,7 @@
config OCFS2_FS config OCFS2_FS
tristate "OCFS2 file system support" tristate "OCFS2 file system support"
depends on INET && SYSFS && CONFIGFS_FS depends on INET && SYSFS && CONFIGFS_FS
select BUFFER_HEAD
select JBD2 select JBD2
select CRC32 select CRC32
select QUOTA select QUOTA

View File

@ -2,6 +2,7 @@
config OMFS_FS config OMFS_FS
tristate "SonicBlue Optimized MPEG File System support" tristate "SonicBlue Optimized MPEG File System support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
select CRC_ITU_T select CRC_ITU_T
help help
This is the proprietary file system used by the Rio Karma music This is the proprietary file system used by the Rio Karma music

View File

@ -2,6 +2,7 @@
config QNX4FS_FS config QNX4FS_FS
tristate "QNX4 file system support (read only)" tristate "QNX4 file system support (read only)"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
This is the file system used by the real-time operating systems This is the file system used by the real-time operating systems
QNX 4 and QNX 6 (the latter is also called QNX RTP). QNX 4 and QNX 6 (the latter is also called QNX RTP).

View File

@ -2,6 +2,7 @@
config QNX6FS_FS config QNX6FS_FS
tristate "QNX6 file system support (read only)" tristate "QNX6 file system support (read only)"
depends on BLOCK && CRC32 depends on BLOCK && CRC32
select BUFFER_HEAD
help help
This is the file system used by the real-time operating systems This is the file system used by the real-time operating systems
QNX 6 (also called QNX RTP). QNX 6 (also called QNX RTP).

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config REISERFS_FS config REISERFS_FS
tristate "Reiserfs support (deprecated)" tristate "Reiserfs support (deprecated)"
select BUFFER_HEAD
select CRC32 select CRC32
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO
help help

View File

@ -57,6 +57,7 @@ endchoice
config ROMFS_ON_BLOCK config ROMFS_ON_BLOCK
bool bool
default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
select BUFFER_HEAD
config ROMFS_ON_MTD config ROMFS_ON_MTD
bool bool

View File

@ -1029,7 +1029,9 @@ static void do_thaw_all_callback(struct super_block *sb)
{ {
down_write(&sb->s_umount); down_write(&sb->s_umount);
if (sb->s_root && sb->s_flags & SB_BORN) { if (sb->s_root && sb->s_flags & SB_BORN) {
emergency_thaw_bdev(sb); if (IS_ENABLED(CONFIG_BLOCK))
while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
thaw_super_locked(sb); thaw_super_locked(sb);
} else { } else {
up_write(&sb->s_umount); up_write(&sb->s_umount);

View File

@ -2,6 +2,7 @@
config SYSV_FS config SYSV_FS
tristate "System V/Xenix/V7/Coherent file system support" tristate "System V/Xenix/V7/Coherent file system support"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
SCO, Xenix and Coherent are commercial Unix systems for Intel SCO, Xenix and Coherent are commercial Unix systems for Intel
machines, and Version 7 was used on the DEC PDP-11. Saying Y machines, and Version 7 was used on the DEC PDP-11. Saying Y

View File

@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
config UDF_FS config UDF_FS
tristate "UDF file system support" tristate "UDF file system support"
select BUFFER_HEAD
select CRC_ITU_T select CRC_ITU_T
select NLS select NLS
select LEGACY_DIRECT_IO select LEGACY_DIRECT_IO

View File

@ -67,7 +67,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf)
err = block_commit_write(page, 0, end); err = block_commit_write(page, 0, end);
if (err < 0) { if (err < 0) {
unlock_page(page); unlock_page(page);
ret = block_page_mkwrite_return(err); ret = vmf_fs_error(err);
goto out_unlock; goto out_unlock;
} }
out_dirty: out_dirty:

View File

@ -2,6 +2,7 @@
config UFS_FS config UFS_FS
tristate "UFS file system support (read only)" tristate "UFS file system support (read only)"
depends on BLOCK depends on BLOCK
select BUFFER_HEAD
help help
BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD, BSD and derivative versions of Unix (such as SunOS, FreeBSD, NetBSD,
OpenBSD and NeXTstep) use a file system called UFS. Some System V OpenBSD and NeXTstep) use a file system called UFS. Some System V

View File

@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_copy_data(struct bio *dst, struct bio *src);
extern void bio_free_pages(struct bio *bio); extern void bio_free_pages(struct bio *bio);
void guard_bio_eod(struct bio *bio); void guard_bio_eod(struct bio *bio);
void zero_fill_bio(struct bio *bio); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
static inline void zero_fill_bio(struct bio *bio)
{
zero_fill_bio_iter(bio, bio->bi_iter);
}
static inline void bio_release_pages(struct bio *bio, bool mark_dirty) static inline void bio_release_pages(struct bio *bio, bool mark_dirty)
{ {

View File

@ -178,14 +178,10 @@ struct request {
struct { struct {
unsigned int seq; unsigned int seq;
struct list_head list;
rq_end_io_fn *saved_end_io; rq_end_io_fn *saved_end_io;
} flush; } flush;
union { u64 fifo_time;
struct __call_single_data csd;
u64 fifo_time;
};
/* /*
* completion callback. * completion callback.

View File

@ -538,6 +538,7 @@ struct request_queue {
#define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */
#define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */ #define QUEUE_FLAG_SYNCHRONOUS 11 /* always completes in submit context */
#define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */
#define QUEUE_FLAG_HW_WC 18 /* Write back caching supported */
#define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */
#define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */
#define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */
@ -846,6 +847,7 @@ extern const char *blk_op_str(enum req_op op);
int blk_status_to_errno(blk_status_t status); int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno); blk_status_t errno_to_blk_status(int errno);
const char *blk_status_to_str(blk_status_t status);
/* only poll the hardware once, don't continue until a completion was found */ /* only poll the hardware once, don't continue until a completion was found */
#define BLK_POLL_ONESHOT (1 << 0) #define BLK_POLL_ONESHOT (1 << 0)

View File

@ -16,8 +16,6 @@
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/atomic.h> #include <linux/atomic.h>
#ifdef CONFIG_BLOCK
enum bh_state_bits { enum bh_state_bits {
BH_Uptodate, /* Contains valid data */ BH_Uptodate, /* Contains valid data */
BH_Dirty, /* Is dirty */ BH_Dirty, /* Is dirty */
@ -198,7 +196,6 @@ void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset); struct page *page, unsigned long offset);
void folio_set_bh(struct buffer_head *bh, struct folio *folio, void folio_set_bh(struct buffer_head *bh, struct folio *folio,
unsigned long offset); unsigned long offset);
bool try_to_free_buffers(struct folio *);
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
bool retry); bool retry);
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
@ -213,10 +210,6 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate);
/* Things to do with buffers at mapping->private_list */ /* Things to do with buffers at mapping->private_list */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode);
int inode_has_buffers(struct inode *);
void invalidate_inode_buffers(struct inode *);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end, int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
bool datasync); bool datasync);
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end, int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
@ -240,9 +233,6 @@ void __bforget(struct buffer_head *);
void __breadahead(struct block_device *, sector_t block, unsigned int size); void __breadahead(struct block_device *, sector_t block, unsigned int size);
struct buffer_head *__bread_gfp(struct block_device *, struct buffer_head *__bread_gfp(struct block_device *,
sector_t block, unsigned size, gfp_t gfp); sector_t block, unsigned size, gfp_t gfp);
void invalidate_bh_lrus(void);
void invalidate_bh_lrus_cpu(void);
bool has_bh_in_lru(int cpu, void *dummy);
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
void free_buffer_head(struct buffer_head * bh); void free_buffer_head(struct buffer_head * bh);
void unlock_buffer(struct buffer_head *bh); void unlock_buffer(struct buffer_head *bh);
@ -258,8 +248,6 @@ int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
void __bh_read_batch(int nr, struct buffer_head *bhs[], void __bh_read_batch(int nr, struct buffer_head *bhs[],
blk_opf_t op_flags, bool force_lock); blk_opf_t op_flags, bool force_lock);
extern int buffer_heads_over_limit;
/* /*
* Generic address_space_operations implementations for buffer_head-backed * Generic address_space_operations implementations for buffer_head-backed
* address_spaces. * address_spaces.
@ -291,18 +279,6 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size);
int block_commit_write(struct page *page, unsigned from, unsigned to); int block_commit_write(struct page *page, unsigned from, unsigned to);
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block); get_block_t get_block);
/* Convert errno to return value from ->page_mkwrite() call */
static inline vm_fault_t block_page_mkwrite_return(int err)
{
if (err == 0)
return VM_FAULT_LOCKED;
if (err == -EFAULT || err == -EAGAIN)
return VM_FAULT_NOPAGE;
if (err == -ENOMEM)
return VM_FAULT_OOM;
/* -ENOSPC, -EDQUOT, -EIO ... */
return VM_FAULT_SIGBUS;
}
sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *);
int block_truncate_page(struct address_space *, loff_t, get_block_t *); int block_truncate_page(struct address_space *, loff_t, get_block_t *);
@ -316,8 +292,6 @@ extern int buffer_migrate_folio_norefs(struct address_space *,
#define buffer_migrate_folio_norefs NULL #define buffer_migrate_folio_norefs NULL
#endif #endif
void buffer_init(void);
/* /*
* inline definitions * inline definitions
*/ */
@ -477,7 +451,20 @@ __bread(struct block_device *bdev, sector_t block, unsigned size)
bool block_dirty_folio(struct address_space *mapping, struct folio *folio); bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
#else /* CONFIG_BLOCK */ #ifdef CONFIG_BUFFER_HEAD
void buffer_init(void);
bool try_to_free_buffers(struct folio *folio);
int inode_has_buffers(struct inode *inode);
void invalidate_inode_buffers(struct inode *inode);
int remove_inode_buffers(struct inode *inode);
int sync_mapping_buffers(struct address_space *mapping);
void invalidate_bh_lrus(void);
void invalidate_bh_lrus_cpu(void);
bool has_bh_in_lru(int cpu, void *dummy);
extern int buffer_heads_over_limit;
#else /* CONFIG_BUFFER_HEAD */
static inline void buffer_init(void) {} static inline void buffer_init(void) {}
static inline bool try_to_free_buffers(struct folio *folio) { return true; } static inline bool try_to_free_buffers(struct folio *folio) { return true; }
@ -485,9 +472,10 @@ static inline int inode_has_buffers(struct inode *inode) { return 0; }
static inline void invalidate_inode_buffers(struct inode *inode) {} static inline void invalidate_inode_buffers(struct inode *inode) {}
static inline int remove_inode_buffers(struct inode *inode) { return 1; } static inline int remove_inode_buffers(struct inode *inode) { return 1; }
static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; } static inline int sync_mapping_buffers(struct address_space *mapping) { return 0; }
static inline void invalidate_bh_lrus(void) {}
static inline void invalidate_bh_lrus_cpu(void) {} static inline void invalidate_bh_lrus_cpu(void) {}
static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; } static inline bool has_bh_in_lru(int cpu, void *dummy) { return false; }
#define buffer_heads_over_limit 0 #define buffer_heads_over_limit 0
#endif /* CONFIG_BLOCK */ #endif /* CONFIG_BUFFER_HEAD */
#endif /* _LINUX_BUFFER_HEAD_H */ #endif /* _LINUX_BUFFER_HEAD_H */

View File

@ -58,7 +58,11 @@ struct vm_fault;
#define IOMAP_F_DIRTY (1U << 1) #define IOMAP_F_DIRTY (1U << 1)
#define IOMAP_F_SHARED (1U << 2) #define IOMAP_F_SHARED (1U << 2)
#define IOMAP_F_MERGED (1U << 3) #define IOMAP_F_MERGED (1U << 3)
#ifdef CONFIG_BUFFER_HEAD
#define IOMAP_F_BUFFER_HEAD (1U << 4) #define IOMAP_F_BUFFER_HEAD (1U << 4)
#else
#define IOMAP_F_BUFFER_HEAD 0
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5) #define IOMAP_F_XATTR (1U << 5)
/* /*

View File

@ -3403,6 +3403,24 @@ static inline vm_fault_t vmf_error(int err)
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
/*
* Convert errno to return value for ->page_mkwrite() calls.
*
* This should eventually be merged with vmf_error() above, but will need a
* careful audit of all vmf_error() callers.
*/
static inline vm_fault_t vmf_fs_error(int err)
{
if (err == 0)
return VM_FAULT_LOCKED;
if (err == -EFAULT || err == -EAGAIN)
return VM_FAULT_NOPAGE;
if (err == -ENOMEM)
return VM_FAULT_OOM;
/* -ENOSPC, -EDQUOT, -EIO ... */
return VM_FAULT_SIGBUS;
}
struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
unsigned int foll_flags); unsigned int foll_flags);
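
The new vmf_fs_error() helper is what the ->page_mkwrite() conversions above (ext4, f2fs, gfs2, nilfs2, udf, iomap) now call instead of the buffer_head-only block_page_mkwrite_return(). A minimal sketch of a fault handler using it, assuming a hypothetical foo_prepare_write() that does the filesystem-specific work and leaves the page locked on success:

#include <linux/fs.h>
#include <linux/mm.h>

static vm_fault_t foo_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	int err;

	sb_start_pagefault(inode->i_sb);
	/* foo_prepare_write() is an assumed helper, not part of this series */
	err = foo_prepare_write(inode, vmf->page);
	sb_end_pagefault(inode->i_sb);

	/* 0 -> VM_FAULT_LOCKED, -ENOMEM -> VM_FAULT_OOM, others -> NOPAGE/SIGBUS */
	return vmf_fs_error(err);
}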

View File

@ -12,6 +12,7 @@
#define RWBS_LEN 8 #define RWBS_LEN 8
#ifdef CONFIG_BUFFER_HEAD
DECLARE_EVENT_CLASS(block_buffer, DECLARE_EVENT_CLASS(block_buffer,
TP_PROTO(struct buffer_head *bh), TP_PROTO(struct buffer_head *bh),
@ -61,6 +62,7 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer,
TP_ARGS(bh) TP_ARGS(bh)
); );
#endif /* CONFIG_BUFFER_HEAD */
/** /**
* block_rq_requeue - place block IO request back on a queue * block_rq_requeue - place block IO request back on a queue

View File

@ -31,8 +31,8 @@ TRACE_EVENT(kyber_latency,
TP_fast_assign( TP_fast_assign(
__entry->dev = dev; __entry->dev = dev;
strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strscpy(__entry->domain, domain, sizeof(__entry->domain));
strlcpy(__entry->type, type, sizeof(__entry->type)); strscpy(__entry->type, type, sizeof(__entry->type));
__entry->percentile = percentile; __entry->percentile = percentile;
__entry->numerator = numerator; __entry->numerator = numerator;
__entry->denominator = denominator; __entry->denominator = denominator;
@ -59,7 +59,7 @@ TRACE_EVENT(kyber_adjust,
TP_fast_assign( TP_fast_assign(
__entry->dev = dev; __entry->dev = dev;
strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strscpy(__entry->domain, domain, sizeof(__entry->domain));
__entry->depth = depth; __entry->depth = depth;
), ),
@ -81,7 +81,7 @@ TRACE_EVENT(kyber_throttled,
TP_fast_assign( TP_fast_assign(
__entry->dev = dev; __entry->dev = dev;
strlcpy(__entry->domain, domain, sizeof(__entry->domain)); strscpy(__entry->domain, domain, sizeof(__entry->domain));
), ),
TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev),

View File

@ -33,7 +33,7 @@ TRACE_EVENT(wbt_stat,
), ),
TP_fast_assign( TP_fast_assign(
strlcpy(__entry->name, bdi_dev_name(bdi), strscpy(__entry->name, bdi_dev_name(bdi),
ARRAY_SIZE(__entry->name)); ARRAY_SIZE(__entry->name));
__entry->rmean = stat[0].mean; __entry->rmean = stat[0].mean;
__entry->rmin = stat[0].min; __entry->rmin = stat[0].min;
@ -68,7 +68,7 @@ TRACE_EVENT(wbt_lat,
), ),
TP_fast_assign( TP_fast_assign(
strlcpy(__entry->name, bdi_dev_name(bdi), strscpy(__entry->name, bdi_dev_name(bdi),
ARRAY_SIZE(__entry->name)); ARRAY_SIZE(__entry->name));
__entry->lat = div_u64(lat, 1000); __entry->lat = div_u64(lat, 1000);
), ),
@ -105,7 +105,7 @@ TRACE_EVENT(wbt_step,
), ),
TP_fast_assign( TP_fast_assign(
strlcpy(__entry->name, bdi_dev_name(bdi), strscpy(__entry->name, bdi_dev_name(bdi),
ARRAY_SIZE(__entry->name)); ARRAY_SIZE(__entry->name));
__entry->msg = msg; __entry->msg = msg;
__entry->step = step; __entry->step = step;
@ -141,7 +141,7 @@ TRACE_EVENT(wbt_timer,
), ),
TP_fast_assign( TP_fast_assign(
strlcpy(__entry->name, bdi_dev_name(bdi), strscpy(__entry->name, bdi_dev_name(bdi),
ARRAY_SIZE(__entry->name)); ARRAY_SIZE(__entry->name));
__entry->status = status; __entry->status = status;
__entry->step = step; __entry->step = step;

View File

@ -107,20 +107,21 @@ enum {
/* /*
* Return an I/O priority value based on a class, a level and a hint. * Return an I/O priority value based on a class, a level and a hint.
*/ */
static __always_inline __u16 ioprio_value(int class, int level, int hint) static __always_inline __u16 ioprio_value(int prioclass, int priolevel,
int priohint)
{ {
if (IOPRIO_BAD_VALUE(class, IOPRIO_NR_CLASSES) || if (IOPRIO_BAD_VALUE(prioclass, IOPRIO_NR_CLASSES) ||
IOPRIO_BAD_VALUE(level, IOPRIO_NR_LEVELS) || IOPRIO_BAD_VALUE(priolevel, IOPRIO_NR_LEVELS) ||
IOPRIO_BAD_VALUE(hint, IOPRIO_NR_HINTS)) IOPRIO_BAD_VALUE(priohint, IOPRIO_NR_HINTS))
return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT; return IOPRIO_CLASS_INVALID << IOPRIO_CLASS_SHIFT;
return (class << IOPRIO_CLASS_SHIFT) | return (prioclass << IOPRIO_CLASS_SHIFT) |
(hint << IOPRIO_HINT_SHIFT) | level; (priohint << IOPRIO_HINT_SHIFT) | priolevel;
} }
#define IOPRIO_PRIO_VALUE(class, level) \ #define IOPRIO_PRIO_VALUE(prioclass, priolevel) \
ioprio_value(class, level, IOPRIO_HINT_NONE) ioprio_value(prioclass, priolevel, IOPRIO_HINT_NONE)
#define IOPRIO_PRIO_VALUE_HINT(class, level, hint) \ #define IOPRIO_PRIO_VALUE_HINT(prioclass, priolevel, priohint) \
ioprio_value(class, level, hint) ioprio_value(prioclass, priolevel, priohint)
#endif /* _UAPI_LINUX_IOPRIO_H */ #endif /* _UAPI_LINUX_IOPRIO_H */
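
The renamed arguments presumably sidestep the C++ 'class' keyword for userspace consumers of this UAPI header; the bit packing itself is unchanged. A small userspace sketch of composing a value with these macros and applying it via the raw ioprio_set syscall (the error handling is omitted):

#include <linux/ioprio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* best-effort class, level 4, no hint: class in the top bits, level in the low bits */
int prio = IOPRIO_PRIO_VALUE_HINT(IOPRIO_CLASS_BE, 4, IOPRIO_HINT_NONE);
int ret  = syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio);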

View File

@ -176,6 +176,12 @@
/* Copy between request and user buffer by pread()/pwrite() */ /* Copy between request and user buffer by pread()/pwrite() */
#define UBLK_F_USER_COPY (1UL << 7) #define UBLK_F_USER_COPY (1UL << 7)
/*
* User space sets this flag when setting up the device to request zoned storage support. Kernel may
* deny the request by returning an error.
*/
#define UBLK_F_ZONED (1ULL << 8)
/* device state */ /* device state */
#define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_DEAD 0
#define UBLK_S_DEV_LIVE 1 #define UBLK_S_DEV_LIVE 1
@ -232,9 +238,26 @@ struct ublksrv_ctrl_dev_info {
#define UBLK_IO_OP_READ 0 #define UBLK_IO_OP_READ 0
#define UBLK_IO_OP_WRITE 1 #define UBLK_IO_OP_WRITE 1
#define UBLK_IO_OP_FLUSH 2 #define UBLK_IO_OP_FLUSH 2
#define UBLK_IO_OP_DISCARD 3 #define UBLK_IO_OP_DISCARD 3
#define UBLK_IO_OP_WRITE_SAME 4 #define UBLK_IO_OP_WRITE_SAME 4
#define UBLK_IO_OP_WRITE_ZEROES 5 #define UBLK_IO_OP_WRITE_ZEROES 5
#define UBLK_IO_OP_ZONE_OPEN 10
#define UBLK_IO_OP_ZONE_CLOSE 11
#define UBLK_IO_OP_ZONE_FINISH 12
#define UBLK_IO_OP_ZONE_APPEND 13
#define UBLK_IO_OP_ZONE_RESET 15
/*
* Construct a zone report. The report request is carried in `struct
* ublksrv_io_desc`. The `start_sector` field must be the first sector of a zone
* and shall indicate the first zone of the report. The `nr_zones` shall
* indicate how many zones should be reported at most. The report shall be
* delivered as a `struct blk_zone` array. To report fewer zones than requested,
* zero the last entry of the returned array.
*
* Related definitions (blk_zone, blk_zone_cond, blk_zone_type, ...) in
* include/uapi/linux/blkzoned.h are part of the ublk UAPI.
*/
#define UBLK_IO_OP_REPORT_ZONES 18
#define UBLK_IO_F_FAILFAST_DEV (1U << 8) #define UBLK_IO_F_FAILFAST_DEV (1U << 8)
#define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9) #define UBLK_IO_F_FAILFAST_TRANSPORT (1U << 9)
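
To make the report-zones contract above concrete, here is a hedged daemon-side sketch. Only the UAPI structs and fields come from this series; backend_fill_zones(), the descriptor lookup and the pwrite offset are placeholder assumptions:

#include <stdlib.h>
#include <unistd.h>
#include <linux/ublk_cmd.h>
#include <linux/blkzoned.h>

/* 'iod' is the per-tag descriptor the daemon already mapped from the device */
static void handle_report_zones(const struct ublksrv_io_desc *iod, int ublkc_fd)
{
	__u32 max_zones = iod->nr_zones;	/* report at most this many zones */
	__u64 sector = iod->start_sector;	/* first sector of the first zone */
	struct blk_zone *zones = calloc(max_zones, sizeof(*zones));

	/* assumed backend helper: fills up to max_zones entries; if fewer are
	 * reported, the trailing entries stay zeroed as the comment above requires */
	backend_fill_zones(zones, sector, max_zones);

	/* with UBLK_F_USER_COPY the report is written back through /dev/ublkcN;
	 * the per-tag file offset calculation is omitted in this sketch */
	pwrite(ublkc_fd, zones, max_zones * sizeof(*zones), 0 /* placeholder offset */);
	free(zones);
}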
@ -255,7 +278,10 @@ struct ublksrv_io_desc {
/* op: bit 0-7, flags: bit 8-31 */ /* op: bit 0-7, flags: bit 8-31 */
__u32 op_flags; __u32 op_flags;
__u32 nr_sectors; union {
__u32 nr_sectors;
__u32 nr_zones; /* for UBLK_IO_OP_REPORT_ZONES */
};
/* start sector for this io */ /* start sector for this io */
__u64 start_sector; __u64 start_sector;
@ -284,11 +310,21 @@ struct ublksrv_io_cmd {
/* io result, it is valid for COMMIT* command only */ /* io result, it is valid for COMMIT* command only */
__s32 result; __s32 result;
/* union {
* userspace buffer address in ublksrv daemon process, valid for /*
* FETCH* command only * userspace buffer address in ublksrv daemon process, valid for
*/ * FETCH* command only
__u64 addr; *
* `addr` should not be used when UBLK_F_USER_COPY is enabled,
* because userspace handles data copy by pread()/pwrite() over
* /dev/ublkcN. But in case of UBLK_F_ZONED, this union is
* re-used to pass back the allocated LBA for
* UBLK_IO_OP_ZONE_APPEND which actually depends on
* UBLK_F_USER_COPY
*/
__u64 addr;
__u64 zone_append_lba;
};
}; };
struct ublk_param_basic { struct ublk_param_basic {
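
The union above is how a zoned ublk server passes the location of a completed zone append back to the kernel: with UBLK_F_USER_COPY there is no buffer address to report, so the same slot carries the allocated LBA. A hedged sketch of filling the commit command; q_id, tag, bytes_written and allocated_lba are assumed to come from the daemon's own bookkeeping:

/* sketch: complete a UBLK_IO_OP_ZONE_APPEND request */
struct ublksrv_io_cmd cmd = {
	.q_id	= q_id,
	.tag	= tag,
	.result	= bytes_written,		/* size of the append on success */
	/* pass back the LBA the backend allocated instead of a buffer address */
	.zone_append_lba = allocated_lba,
};
/* cmd is then submitted as the payload of the usual UBLK_IO_COMMIT_AND_FETCH_REQ step */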
@ -331,6 +367,13 @@ struct ublk_param_devt {
__u32 disk_minor; __u32 disk_minor;
}; };
struct ublk_param_zoned {
__u32 max_open_zones;
__u32 max_active_zones;
__u32 max_zone_append_sectors;
__u8 reserved[20];
};
struct ublk_params { struct ublk_params {
/* /*
* Total length of parameters, userspace has to set 'len' for both * Total length of parameters, userspace has to set 'len' for both
@ -342,11 +385,13 @@ struct ublk_params {
#define UBLK_PARAM_TYPE_BASIC (1 << 0) #define UBLK_PARAM_TYPE_BASIC (1 << 0)
#define UBLK_PARAM_TYPE_DISCARD (1 << 1) #define UBLK_PARAM_TYPE_DISCARD (1 << 1)
#define UBLK_PARAM_TYPE_DEVT (1 << 2) #define UBLK_PARAM_TYPE_DEVT (1 << 2)
#define UBLK_PARAM_TYPE_ZONED (1 << 3)
__u32 types; /* types of parameter included */ __u32 types; /* types of parameter included */
struct ublk_param_basic basic; struct ublk_param_basic basic;
struct ublk_param_discard discard; struct ublk_param_discard discard;
struct ublk_param_devt devt; struct ublk_param_devt devt;
struct ublk_param_zoned zoned;
}; };
#endif #endif

View File

@ -56,7 +56,9 @@ int main(int argc, char *argv[])
uint8_t v; uint8_t v;
uint8_t exptbl[256], invtbl[256]; uint8_t exptbl[256], invtbl[256];
printf("#ifdef __KERNEL__\n");
printf("#include <linux/export.h>\n"); printf("#include <linux/export.h>\n");
printf("#endif\n");
printf("#include <linux/raid/pq.h>\n"); printf("#include <linux/raid/pq.h>\n");
/* Compute multiplication table */ /* Compute multiplication table */

View File

@ -13,7 +13,6 @@
* the syndrome.) * the syndrome.)
*/ */
#include <linux/export.h>
#include <linux/raid/pq.h> #include <linux/raid/pq.h>
/* Recover two failed data blocks. */ /* Recover two failed data blocks. */

lib/raid6/test/.gitignore (new file, 3 lines)
View File

@ -0,0 +1,3 @@
/int.uc
/neon.uc
/raid6test

View File

@ -6,14 +6,15 @@
pound := \# pound := \#
CC = gcc # Adjust as desired
OPTFLAGS = -O2 # Adjust as desired CC = gcc
CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS) OPTFLAGS = -O2
LD = ld CFLAGS = -I.. -I ../../../include -g $(OPTFLAGS)
AWK = awk -f LD = ld
AR = ar AWK = awk -f
RANLIB = ranlib AR = ar
OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o RANLIB = ranlib
OBJS = int1.o int2.o int4.o int8.o int16.o int32.o recov.o algos.o tables.o
ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/) ARCH := $(shell uname -m 2>/dev/null | sed -e /s/i.86/i386/)
ifeq ($(ARCH),i386) ifeq ($(ARCH),i386)
@ -34,24 +35,25 @@ ifeq ($(ARCH),aarch64)
HAS_NEON = yes HAS_NEON = yes
endif endif
ifeq ($(findstring ppc,$(ARCH)),ppc)
CFLAGS += -I../../../arch/powerpc/include
HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\
gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
endif
ifeq ($(IS_X86),yes) ifeq ($(IS_X86),yes)
OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o OBJS += mmx.o sse1.o sse2.o avx2.o recov_ssse3.o recov_avx2.o avx512.o recov_avx512.o
CFLAGS += -DCONFIG_X86 CFLAGS += -DCONFIG_X86
CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \ CFLAGS += $(shell echo "vpmovm2b %k1, %zmm5" | \
gcc -c -x assembler - >/dev/null 2>&1 && \ gcc -c -x assembler - >/dev/null 2>&1 && \
rm ./-.o && echo -DCONFIG_AS_AVX512=1) rm ./-.o && echo -DCONFIG_AS_AVX512=1)
else ifeq ($(HAS_NEON),yes) else ifeq ($(HAS_NEON),yes)
OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o OBJS += neon.o neon1.o neon2.o neon4.o neon8.o recov_neon.o recov_neon_inner.o
CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
else else ifeq ($(HAS_ALTIVEC),yes)
HAS_ALTIVEC := $(shell printf '$(pound)include <altivec.h>\nvector int a;\n' |\ CFLAGS += -DCONFIG_ALTIVEC
gcc -c -x c - >/dev/null && rm ./-.o && echo yes) OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
ifeq ($(HAS_ALTIVEC),yes) vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
CFLAGS += -I../../../arch/powerpc/include
CFLAGS += -DCONFIG_ALTIVEC
OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o
endif
endif endif
.c.o: .c.o:
@ -63,12 +65,12 @@ endif
%.uc: ../%.uc %.uc: ../%.uc
cp -f $< $@ cp -f $< $@
all: raid6.a raid6test all: raid6.a raid6test
raid6.a: $(OBJS) raid6.a: $(OBJS)
rm -f $@ rm -f $@
$(AR) cq $@ $^ $(AR) cq $@ $^
$(RANLIB) $@ $(RANLIB) $@
raid6test: test.c raid6.a raid6test: test.c raid6.a
$(CC) $(CFLAGS) -o raid6test $^ $(CC) $(CFLAGS) -o raid6test $^

View File

@ -684,7 +684,7 @@ int migrate_folio(struct address_space *mapping, struct folio *dst,
 }
 EXPORT_SYMBOL(migrate_folio);
 
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
 /* Returns true if all buffers are successfully locked */
 static bool buffer_migrate_lock_buffers(struct buffer_head *head,
 							enum migrate_mode mode)
@ -837,7 +837,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
 	return __buffer_migrate_folio(mapping, dst, src, mode, true);
 }
 EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
-#endif
+#endif /* CONFIG_BUFFER_HEAD */
 
 int filemap_migrate_folio(struct address_space *mapping,
 		struct folio *dst, struct folio *src, enum migrate_mode mode)


@ -100,6 +100,7 @@ class IocStat:
         self.period_at = ioc.period_at.value_() / 1_000_000
         self.vperiod_at = ioc.period_at_vtime.value_() / VTIME_PER_SEC
         self.vrate_pct = ioc.vtime_base_rate.value_() * 100 / VTIME_PER_USEC
+        self.ivrate_pct = ioc.vtime_rate.counter.value_() * 100 / VTIME_PER_USEC
         self.busy_level = ioc.busy_level.value_()
         self.autop_idx = ioc.autop_idx.value_()
         self.user_cost_model = ioc.user_cost_model.value_()
@ -119,7 +120,9 @@ class IocStat:
             'period_at' : self.period_at,
             'period_vtime_at' : self.vperiod_at,
             'busy_level' : self.busy_level,
-            'vrate_pct' : self.vrate_pct, }
+            'vrate_pct' : self.vrate_pct,
+            'ivrate_pct' : self.ivrate_pct,
+        }
 
     def table_preamble_str(self):
         state = ('RUN' if self.running else 'IDLE') if self.enabled else 'OFF'
@ -127,7 +130,7 @@ class IocStat:
                  f'per={self.period_ms}ms ' \
                  f'cur_per={self.period_at:.3f}:v{self.vperiod_at:.3f} ' \
                  f'busy={self.busy_level:+3} ' \
-                 f'vrate={self.vrate_pct:6.2f}% ' \
+                 f'vrate={self.vrate_pct:6.2f}%:{self.ivrate_pct:6.2f}% ' \
                  f'params={self.autop_name}'
         if self.user_cost_model or self.user_qos_params:
             output += f'({"C" if self.user_cost_model else ""}{"Q" if self.user_qos_params else ""})'
@ -135,7 +138,7 @@ class IocStat:
 
     def table_header_str(self):
         return f'{"":25} active {"weight":>9} {"hweight%":>13} {"inflt%":>6} ' \
-               f'{"debt":>7} {"delay":>7} {"usage%"}'
+               f'{"usage%":>6} {"wait":>7} {"debt":>7} {"delay":>7}'
 
 class IocgStat:
     def __init__(self, iocg):
@ -161,6 +164,8 @@ class IocgStat:
         self.usage = (100 * iocg.usage_delta_us.value_() /
                       ioc.period_us.value_()) if self.active else 0
+        self.wait_ms = (iocg.stat.wait_us.value_() -
+                        iocg.last_stat.wait_us.value_()) / 1000
         self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000
         if blkg.use_delay.counter.value_() != 0:
             self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000
@ -177,9 +182,10 @@ class IocgStat:
             'hweight_active_pct' : self.hwa_pct,
             'hweight_inuse_pct' : self.hwi_pct,
             'inflight_pct' : self.inflight_pct,
+            'usage_pct' : self.usage,
+            'wait_ms' : self.wait_ms,
             'debt_ms' : self.debt_ms,
             'delay_ms' : self.delay_ms,
-            'usage_pct' : self.usage,
             'address' : self.address }
 
         return out
@ -189,9 +195,10 @@ class IocgStat:
               f'{round(self.inuse):5}/{round(self.active):5} ' \
               f'{self.hwi_pct:6.2f}/{self.hwa_pct:6.2f} ' \
               f'{self.inflight_pct:6.2f} ' \
+              f'{min(self.usage, 999):6.2f} ' \
+              f'{self.wait_ms:7.2f} ' \
               f'{self.debt_ms:7.2f} ' \
-              f'{self.delay_ms:7.2f} '\
-              f'{min(self.usage, 999):6.2f}'
+              f'{self.delay_ms:7.2f}'
         out = out.rstrip(':')
         return out
@ -221,7 +228,7 @@ ioc = None
 for i, ptr in radix_tree_for_each(blkcg_root.blkg_tree.address_of_()):
     blkg = drgn.Object(prog, 'struct blkcg_gq', address=ptr)
     try:
-        if devname == blkg.q.kobj.parent.name.string_().decode('utf-8'):
+        if devname == blkg.q.mq_kobj.parent.name.string_().decode('utf-8'):
             q_id = blkg.q.id.value_()
             if blkg.pd[plid]:
                 root_iocg = container_of(blkg.pd[plid], 'struct ioc_gq', 'pd')
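
Usage sketch (assumed invocation, not shown in the diff): iocost_monitor.py is a drgn script, so it needs the drgn package, debug info for the running kernel, and root privileges, and it is pointed at a block device by name; the device name below is only an example:

    ./iocost_monitor.py sdb     # report ioc/iocg stats for /dev/sdb
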