diff --git a/drivers/md/md.h b/drivers/md/md.h index 8d881cc59799..b2076a165c10 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -207,6 +207,7 @@ enum flag_bits { * check if there is collision between raid1 * serial bios. */ + Nonrot, /* non-rotational device (SSD) */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -222,6 +223,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, } return 0; } + +static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, + int sectors) +{ + sector_t first_bad; + int bad_sectors; + + return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); +} + extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 512746551f36..2ea1710a3b70 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev) return false; } + +/** + * raid1_check_read_range() - check a given read range for bad blocks, + * available read length is returned; + * @rdev: the rdev to read; + * @this_sector: read position; + * @len: read length; + * + * helper function for read_balance() + * + * 1) If there are no bad blocks in the range, @len is returned; + * 2) If the range are all bad blocks, 0 is returned; + * 3) If there are partial bad blocks: + * - If the bad block range starts after @this_sector, the length of first + * good region is returned; + * - If the bad block range starts before @this_sector, 0 is returned and + * the @len is updated to the offset into the region before we get to the + * good blocks; + */ +static inline int raid1_check_read_range(struct md_rdev *rdev, + sector_t this_sector, int *len) +{ + sector_t first_bad; + int bad_sectors; + + /* no bad block overlap */ + if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) + return *len; + + /* + * bad block range starts offset into our range so we can return the + * number of sectors before the bad blocks start. + */ + if (first_bad > this_sector) + return first_bad - this_sector; + + /* read range is fully consumed by bad blocks. */ + if (this_sector + *len <= first_bad + bad_sectors) + return 0; + + /* + * final case, bad block range starts before or at the start of our + * range but does not cover our entire range so we still return 0 but + * update the length with the number of sectors before we get to the + * good ones. + */ + *len = first_bad + bad_sectors - this_sector; + return 0; +} + +/* + * Check if read should choose the first rdev. + * + * Balance on the whole device if no resync is going on (recovery is ok) or + * below the resync window. Otherwise, take the first readable disk. + */ +static inline bool raid1_should_read_first(struct mddev *mddev, + sector_t this_sector, int len) +{ + if ((mddev->recovery_cp < this_sector + len)) + return true; + + if (mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, READ, this_sector, + this_sector + len)) + return true; + + return false; +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 286f8b16c7bd..afca975ec7f3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -498,9 +498,6 @@ static void raid1_end_write_request(struct bio *bio) * to user-side. So if something waits for IO, then it * will wait for the 'master' bio. 
*/ - sector_t first_bad; - int bad_sectors; - r1_bio->bios[mirror] = NULL; to_put = bio; /* @@ -516,8 +513,8 @@ static void raid1_end_write_request(struct bio *bio) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && !discard_error) { + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !discard_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } @@ -582,211 +579,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector, return len; } -/* - * This routine returns the disk from which the requested read should - * be done. There is a per-array 'next expected sequential IO' sector - * number - if this matches on the next IO then we use the last disk. - * There is also a per-disk 'last know head position' sector that is - * maintained from IRQ contexts, both the normal and the resync IO - * completion handlers update this position correctly. If there is no - * perfect sequential match then we pick the disk whose head is closest. - * - * If there are 2 mirrors in the same 2 devices, performance degrades - * because position is mirror, not device based. - * - * The rdev for the device selected will have nr_pending incremented. - */ -static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) +static void update_read_sectors(struct r1conf *conf, int disk, + sector_t this_sector, int len) { - const sector_t this_sector = r1_bio->sector; - int sectors; - int best_good_sectors; - int best_disk, best_dist_disk, best_pending_disk; - int has_nonrot_disk; + struct raid1_info *info = &conf->mirrors[disk]; + + atomic_inc(&info->rdev->nr_pending); + if (info->next_seq_sect != this_sector) + info->seq_start = this_sector; + info->next_seq_sect = this_sector + len; +} + +static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int len = r1_bio->sectors; int disk; - sector_t best_dist; - unsigned int min_pending; - struct md_rdev *rdev; - int choose_first; - int choose_next_idle; - - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. 
- */ - retry: - sectors = r1_bio->sectors; - best_disk = -1; - best_dist_disk = -1; - best_dist = MaxSector; - best_pending_disk = -1; - min_pending = UINT_MAX; - best_good_sectors = 0; - has_nonrot_disk = 0; - choose_next_idle = 0; - clear_bit(R1BIO_FailFast, &r1_bio->state); - - if ((conf->mddev->recovery_cp < this_sector + sectors) || - (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, - this_sector + sectors))) - choose_first = 1; - else - choose_first = 0; for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { - sector_t dist; - sector_t first_bad; - int bad_sectors; - unsigned int pending; - bool nonrot; + struct md_rdev *rdev; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; rdev = conf->mirrors[disk].rdev; - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || test_bit(Faulty, &rdev->flags)) - continue; - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < this_sector + sectors) - continue; - if (test_bit(WriteMostly, &rdev->flags)) { - /* Don't balance among write-mostly, just - * use the first as a last resort */ - if (best_dist_disk < 0) { - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (first_bad <= this_sector) - /* Cannot use this */ - continue; - best_good_sectors = first_bad - this_sector; - } else - best_good_sectors = sectors; - best_dist_disk = disk; - best_pending_disk = disk; - } - continue; - } - /* This is a reasonable device to use. It might - * even be best. - */ - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* already have a better device */ - continue; - if (first_bad <= this_sector) { - /* cannot read here. If this is the 'primary' - * device, then we must not read beyond - * bad_sectors from another device.. - */ - bad_sectors -= (this_sector - first_bad); - if (choose_first && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - - } else { - sector_t good_sectors = first_bad - this_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_disk = disk; - } - if (choose_first) - break; - } - continue; - } else { - if ((sectors > best_good_sectors) && (best_disk >= 0)) - best_disk = -1; - best_good_sectors = sectors; - } - - if (best_disk >= 0) - /* At least two disks to choose from so failfast is OK */ - set_bit(R1BIO_FailFast, &r1_bio->state); - - nonrot = bdev_nonrot(rdev->bdev); - has_nonrot_disk |= nonrot; - pending = atomic_read(&rdev->nr_pending); - dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first) { - best_disk = disk; - break; - } - /* Don't change to another disk for sequential reads */ - if (conf->mirrors[disk].next_seq_sect == this_sector - || dist == 0) { - int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; - struct raid1_info *mirror = &conf->mirrors[disk]; - - best_disk = disk; - /* - * If buffered sequential IO size exceeds optimal - * iosize, check if there is idle disk. If yes, choose - * the idle disk. read_balance could already choose an - * idle disk before noticing it's a sequential IO in - * this disk. This doesn't matter because this disk - * will idle, next time it will be utilized after the - * first disk has IO size exceeds optimal iosize. In - * this way, iosize of the first disk will be optimal - * iosize at least. 
iosize of the second disk might be - * small, but not a big deal since when the second disk - * starts IO, the first disk is likely still busy. - */ - if (nonrot && opt_iosize > 0 && - mirror->seq_start != MaxSector && - mirror->next_seq_sect > opt_iosize && - mirror->next_seq_sect - opt_iosize >= - mirror->seq_start) { - choose_next_idle = 1; - continue; - } - break; - } - - if (choose_next_idle) + if (!rdev || test_bit(Faulty, &rdev->flags)) continue; - if (min_pending > pending) { - min_pending = pending; - best_pending_disk = disk; - } - - if (dist < best_dist) { - best_dist = dist; - best_dist_disk = disk; + /* choose the first disk even if it has some bad blocks. */ + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > 0) { + update_read_sectors(conf, disk, this_sector, read_len); + *max_sectors = read_len; + return disk; } } + return -1; +} + +static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int best_disk = -1; + int best_len = 0; + int disk; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(WriteMostly, &rdev->flags)) + continue; + + /* keep track of the disk with the most readable sectors. */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > best_len) { + best_disk = disk; + best_len = read_len; + } + } + + if (best_disk != -1) { + *max_sectors = best_len; + update_read_sectors(conf, best_disk, this_sector, best_len); + } + + return best_disk; +} + +static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int bb_disk = -1; + int bb_read_len = 0; + int disk; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || + !test_bit(WriteMostly, &rdev->flags)) + continue; + + /* there are no bad blocks, we can use this disk */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len == r1_bio->sectors) { + update_read_sectors(conf, disk, this_sector, read_len); + return disk; + } + + /* + * there are partial bad blocks, choose the rdev with largest + * read length. + */ + if (read_len > bb_read_len) { + bb_disk = disk; + bb_read_len = read_len; + } + } + + if (bb_disk != -1) { + *max_sectors = bb_read_len; + update_read_sectors(conf, bb_disk, this_sector, bb_read_len); + } + + return bb_disk; +} + +static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio) +{ + /* TODO: address issues with this check and concurrency. */ + return conf->mirrors[disk].next_seq_sect == r1_bio->sector || + conf->mirrors[disk].head_position == r1_bio->sector; +} + +/* + * If buffered sequential IO size exceeds optimal iosize, check if there is idle + * disk. If yes, choose the idle disk. 
+ */ +static bool should_choose_next(struct r1conf *conf, int disk) +{ + struct raid1_info *mirror = &conf->mirrors[disk]; + int opt_iosize; + + if (!test_bit(Nonrot, &mirror->rdev->flags)) + return false; + + opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9; + return opt_iosize > 0 && mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= mirror->seq_start; +} + +static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + if (!rdev || test_bit(Faulty, &rdev->flags)) + return false; + + /* still in recovery */ + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) + return false; + + /* don't read from slow disk unless have to */ + if (test_bit(WriteMostly, &rdev->flags)) + return false; + + /* don't split IO for bad blocks unless have to */ + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors)) + return false; + + return true; +} + +struct read_balance_ctl { + sector_t closest_dist; + int closest_dist_disk; + int min_pending; + int min_pending_disk; + int sequential_disk; + int readable_disks; +}; + +static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio) +{ + int disk; + struct read_balance_ctl ctl = { + .closest_dist_disk = -1, + .closest_dist = MaxSector, + .min_pending_disk = -1, + .min_pending = UINT_MAX, + .sequential_disk = -1, + }; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + sector_t dist; + unsigned int pending; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev_readable(rdev, r1_bio)) + continue; + + /* At least two disks to choose from so failfast is OK */ + if (ctl.readable_disks++ == 1) + set_bit(R1BIO_FailFast, &r1_bio->state); + + pending = atomic_read(&rdev->nr_pending); + dist = abs(r1_bio->sector - conf->mirrors[disk].head_position); + + /* Don't change to another disk for sequential reads */ + if (is_sequential(conf, disk, r1_bio)) { + if (!should_choose_next(conf, disk)) + return disk; + + /* + * Add 'pending' to avoid choosing this disk if + * there is other idle disk. + */ + pending++; + /* + * If there is no other idle disk, this disk + * will be chosen. + */ + ctl.sequential_disk = disk; + } + + if (ctl.min_pending > pending) { + ctl.min_pending = pending; + ctl.min_pending_disk = disk; + } + + if (ctl.closest_dist > dist) { + ctl.closest_dist = dist; + ctl.closest_dist_disk = disk; + } + } + + /* + * sequential IO size exceeds optimal iosize, however, there is no other + * idle disk, so choose the sequential disk. + */ + if (ctl.sequential_disk != -1 && ctl.min_pending != 0) + return ctl.sequential_disk; + /* * If all disks are rotational, choose the closest disk. If any disk is * non-rotational, choose the disk with less pending request even the * disk is rotational, which might/might not be optimal for raids with * mixed ratation/non-rotational disks depending on workload. */ - if (best_disk == -1) { - if (has_nonrot_disk || min_pending == 0) - best_disk = best_pending_disk; - else - best_disk = best_dist_disk; + if (ctl.min_pending_disk != -1 && + (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0)) + return ctl.min_pending_disk; + else + return ctl.closest_dist_disk; +} + +/* + * This routine returns the disk from which the requested read should be done. + * + * 1) If resync is in progress, find the first usable disk and use it even if it + * has some bad blocks. 
+ * + * 2) Now that there is no resync, loop through all disks and skipping slow + * disks and disks with bad blocks for now. Only pay attention to key disk + * choice. + * + * 3) If we've made it this far, now look for disks with bad blocks and choose + * the one with most number of sectors. + * + * 4) If we are all the way at the end, we have no choice but to use a disk even + * if it is write mostly. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + int disk; + + clear_bit(R1BIO_FailFast, &r1_bio->state); + + if (raid1_should_read_first(conf->mddev, r1_bio->sector, + r1_bio->sectors)) + return choose_first_rdev(conf, r1_bio, max_sectors); + + disk = choose_best_rdev(conf, r1_bio); + if (disk >= 0) { + *max_sectors = r1_bio->sectors; + update_read_sectors(conf, disk, r1_bio->sector, + r1_bio->sectors); + return disk; } - if (best_disk >= 0) { - rdev = conf->mirrors[best_disk].rdev; - if (!rdev) - goto retry; - atomic_inc(&rdev->nr_pending); - sectors = best_good_sectors; + /* + * If we are here it means we didn't find a perfectly good disk so + * now spend a bit more time trying to find one with the most good + * sectors. + */ + disk = choose_bb_rdev(conf, r1_bio, max_sectors); + if (disk >= 0) + return disk; - if (conf->mirrors[best_disk].next_seq_sect != this_sector) - conf->mirrors[best_disk].seq_start = this_sector; - - conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; - } - *max_sectors = sectors; - - return best_disk; + return choose_slow_rdev(conf, r1_bio, max_sectors); } static void wake_up_barrier(struct r1conf *conf) @@ -1760,6 +1858,52 @@ static int raid1_spare_active(struct mddev *mddev) return count; } +static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, + bool replacement) +{ + struct raid1_info *info = conf->mirrors + disk; + + if (replacement) + info += conf->raid_disks; + + if (info->rdev) + return false; + + if (bdev_nonrot(rdev->bdev)) { + set_bit(Nonrot, &rdev->flags); + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); + } + + rdev->raid_disk = disk; + info->head_position = 0; + info->seq_start = MaxSector; + WRITE_ONCE(info->rdev, rdev); + + return true; +} + +static bool raid1_remove_conf(struct r1conf *conf, int disk) +{ + struct raid1_info *info = conf->mirrors + disk; + struct md_rdev *rdev = info->rdev; + + if (!rdev || test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) + return false; + + /* Only remove non-faulty devices if recovery is not possible. 
*/ + if (!test_bit(Faulty, &rdev->flags) && + rdev->mddev->recovery_disabled != conf->recovery_disabled && + rdev->mddev->degraded < conf->raid_disks) + return false; + + if (test_and_clear_bit(Nonrot, &rdev->flags)) + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1); + + WRITE_ONCE(info->rdev, NULL); + return true; +} + static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; @@ -1795,15 +1939,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); - p->head_position = 0; - rdev->raid_disk = mirror; + raid1_add_conf(conf, rdev, mirror, false); err = 0; /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; - WRITE_ONCE(p->rdev, rdev); break; } if (test_bit(WantReplacement, &p->rdev->flags) && @@ -1813,13 +1955,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (err && repl_slot >= 0) { /* Add this device as a replacement */ - p = conf->mirrors + repl_slot; clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); - rdev->raid_disk = repl_slot; + raid1_add_conf(conf, rdev, repl_slot, true); err = 0; conf->fullsync = 1; - WRITE_ONCE(p[conf->raid_disks].rdev, rdev); } print_conf(conf); @@ -1836,27 +1976,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (unlikely(number >= conf->raid_disks)) goto abort; - if (rdev != p->rdev) - p = conf->mirrors + conf->raid_disks + number; + if (rdev != p->rdev) { + number += conf->raid_disks; + p = conf->mirrors + number; + } print_conf(conf); if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { + if (!raid1_remove_conf(conf, number)) { err = -EBUSY; goto abort; } - /* Only remove non-faulty devices if recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - mddev->degraded < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - WRITE_ONCE(p->rdev, NULL); - if (conf->mirrors[conf->raid_disks + number].rdev) { + + if (number < conf->raid_disks && + conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. 
@@ -1944,8 +2077,6 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - sector_t first_bad; - int bad_sectors; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { @@ -1955,14 +2086,11 @@ static void end_sync_write(struct bio *bio) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) - ) + } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, r1_bio->sectors)) { set_bit(R1BIO_MadeGood, &r1_bio->state); + } put_sync_write_buf(r1_bio, uptodate); } @@ -2279,16 +2407,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) s = PAGE_SIZE >> 9; do { - sector_t first_bad; - int bad_sectors; - rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && - is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0) { + rdev_has_badblock(rdev, sect, s) == 0) { atomic_inc(&rdev->nr_pending); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) @@ -3006,23 +3130,17 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); + conf->raid_disks = mddev->raid_disks; rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks - || disk_idx < 0) - continue; - if (test_bit(Replacement, &rdev->flags)) - disk = conf->mirrors + mddev->raid_disks + disk_idx; - else - disk = conf->mirrors + disk_idx; - if (disk->rdev) + if (disk_idx >= conf->raid_disks || disk_idx < 0) + continue; + + if (!raid1_add_conf(conf, rdev, disk_idx, + test_bit(Replacement, &rdev->flags))) goto abort; - disk->rdev = rdev; - disk->head_position = 0; - disk->seq_start = MaxSector; } - conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 14d4211a123a..5300cbaa58a4 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -71,6 +71,7 @@ struct r1conf { * allow for replacements. */ int raid_disks; + int nonrot_disks; spinlock_t device_lock; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7412066ea22c..8aecdb1ccc16 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -518,11 +518,7 @@ static void raid10_end_write_request(struct bio *bio) * The 'master' represents the composite IO operation to * user-side. So if something waits for IO, then it will * wait for the 'master' bio. - */ - sector_t first_bad; - int bad_sectors; - - /* + * * Do not set R10BIO_Uptodate if the current device is * rebuilding or Faulty. This is because we cannot use * such device for properly reading the data back (we could @@ -535,10 +531,9 @@ static void raid10_end_write_request(struct bio *bio) set_bit(R10BIO_Uptodate, &r10_bio->state); /* Maybe we can clear some bad blocks. 
*/ - if (is_badblock(rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors) && !discard_error) { + if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, + r10_bio->sectors) && + !discard_error) { bio_put(bio); if (repl) r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; @@ -753,17 +748,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, best_good_sectors = 0; do_balance = 1; clear_bit(R10BIO_FailFast, &r10_bio->state); - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on (recovery is ok), or below - * the resync window. We take the first readable disk when - * above the resync window. - */ - if ((conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) || - (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, - this_sector + sectors))) + + if (raid1_should_read_first(conf->mddev, this_sector, sectors)) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { @@ -1330,10 +1316,7 @@ retry_wait: } if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; - int is_bad; /* * Discard request doesn't care the write result @@ -1342,9 +1325,8 @@ retry_wait: if (!r10_bio->sectors) continue; - is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { + if (rdev_has_badblock(rdev, dev_sector, + r10_bio->sectors) < 0) { /* * Mustn't write here until the bad block * is acknowledged @@ -2290,8 +2272,6 @@ static void end_sync_write(struct bio *bio) struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; - sector_t first_bad; - int bad_sectors; int slot; int repl; struct md_rdev *rdev = NULL; @@ -2312,11 +2292,10 @@ static void end_sync_write(struct bio *bio) &rdev->mddev->recovery); set_bit(R10BIO_WriteError, &r10_bio->state); } - } else if (is_badblock(rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) + } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, + r10_bio->sectors)) { set_bit(R10BIO_MadeGood, &r10_bio->state); + } rdev_dec_pending(rdev, mddev); @@ -2597,11 +2576,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, int sectors, struct page *page, enum req_op op) { - sector_t first_bad; - int bad_sectors; - - if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) - && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) + if (rdev_has_badblock(rdev, sector, sectors) && + (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) return -1; if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) /* success */ @@ -2658,16 +2634,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 s = PAGE_SIZE >> 9; do { - sector_t first_bad; - int bad_sectors; - d = r10_bio->devs[sl].devnum; rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && - is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, - &first_bad, &bad_sectors) == 0) { + rdev_has_badblock(rdev, + r10_bio->devs[sl].addr + sect, + s) == 0) { atomic_inc(&rdev->nr_pending); success = sync_page_io(rdev, r10_bio->devs[sl].addr + diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7ec445f49f1c..48129de21aec 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1210,10 
+1210,8 @@ again: */ while (op_is_write(op) && rdev && test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors); + int bad = rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf)); if (!bad) break; @@ -2855,8 +2853,6 @@ static void raid5_end_write_request(struct bio *bi) struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; struct md_rdev *rdev; - sector_t first_bad; - int bad_sectors; int replacement = 0; for (i = 0 ; i < disks; i++) { @@ -2888,9 +2884,8 @@ static void raid5_end_write_request(struct bio *bi) if (replacement) { if (bi->bi_status) md_error(conf->mddev, rdev); - else if (is_badblock(rdev, sh->sector, - RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors)) + else if (rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf))) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (bi->bi_status) { @@ -2900,9 +2895,8 @@ static void raid5_end_write_request(struct bio *bi) if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - } else if (is_badblock(rdev, sh->sector, - RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors)) { + } else if (rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf))) { set_bit(R5_MadeGood, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) /* That was a successful write so make @@ -4674,8 +4668,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Now to look around and see what can be done */ for (i=disks; i--; ) { struct md_rdev *rdev; - sector_t first_bad; - int bad_sectors; int is_bad = 0; dev = &sh->dev[i]; @@ -4719,8 +4711,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rdev = conf->disks[i].replacement; if (rdev && !test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && - !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors)) + !rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf))) set_bit(R5_ReadRepl, &dev->flags); else { if (rdev && !test_bit(Faulty, &rdev->flags)) @@ -4733,8 +4725,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { - is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors); + is_bad = rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf)); if (s->blocked_rdev == NULL && (test_bit(Blocked, &rdev->flags) || is_bad < 0)) { @@ -5463,8 +5455,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct r5conf *conf = mddev->private; struct bio *align_bio; struct md_rdev *rdev; - sector_t sector, end_sector, first_bad; - int bad_sectors, dd_idx; + sector_t sector, end_sector; + int dd_idx; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5493,8 +5485,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); - if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, - &bad_sectors)) { + if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) { rdev_dec_pending(rdev, mddev); return 0; }
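
The kernel-doc above raid1_check_read_range() enumerates three outcomes: a clean range, a fully bad range, and a partial overlap on either side of @this_sector. As a sanity check of that arithmetic, here is a minimal, stand-alone user-space sketch; struct fake_rdev, fake_is_badblock() and check_read_range() are hypothetical stand-ins for the md types and helpers (modelling only a single bad-block extent) and are not part of the patch.

	/*
	 * Illustrative user-space sketch of the range arithmetic that
	 * raid1_check_read_range() performs.  Everything here is a stand-in
	 * so the example is self-contained.
	 */
	#include <assert.h>
	#include <stdio.h>

	typedef unsigned long long sector_t;

	/* A single bad-block extent standing in for the rdev's bad-block table. */
	struct fake_rdev {
		sector_t bad_start;
		int bad_len;		/* 0 means "no bad blocks at all" */
	};

	/* Mimics is_badblock(): report the overlapping bad extent, if any. */
	static int fake_is_badblock(const struct fake_rdev *rdev, sector_t s,
				    int sectors, sector_t *first_bad, int *bad_sectors)
	{
		if (rdev->bad_len == 0 ||
		    rdev->bad_start >= s + sectors ||
		    rdev->bad_start + rdev->bad_len <= s)
			return 0;
		*first_bad = rdev->bad_start;
		*bad_sectors = rdev->bad_len;
		return 1;
	}

	/*
	 * Same contract as raid1_check_read_range(): a positive return is the
	 * readable prefix length; a return of 0 with *len shrunk tells the
	 * caller how many leading sectors are unreadable.
	 */
	static int check_read_range(const struct fake_rdev *rdev,
				    sector_t this_sector, int *len)
	{
		sector_t first_bad;
		int bad_sectors;

		if (!fake_is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
			return *len;			/* case 1: clean range */

		if (first_bad > this_sector)
			return first_bad - this_sector;	/* case 3a: good prefix */

		if (this_sector + *len <= first_bad + bad_sectors)
			return 0;			/* case 2: fully bad */

		*len = first_bad + bad_sectors - this_sector; /* case 3b: bad prefix */
		return 0;
	}

	int main(void)
	{
		struct fake_rdev clean = { 0, 0 };
		struct fake_rdev bad_tail = { 100, 8 };	/* bad sectors 100..107 */
		struct fake_rdev bad_head = { 90, 8 };	/* bad sectors 90..97 */
		int len;

		len = 16;
		assert(check_read_range(&clean, 96, &len) == 16);	/* case 1 */

		len = 4;
		assert(check_read_range(&bad_head, 92, &len) == 0);	/* case 2 */
		assert(len == 4);			/* fully bad, len untouched */

		len = 16;
		assert(check_read_range(&bad_tail, 96, &len) == 4);	/* case 3a: 96..99 */

		len = 16;
		assert(check_read_range(&bad_head, 92, &len) == 0);	/* case 3b */
		assert(len == 6);			/* sectors 92..97 must be skipped */

		printf("all documented cases hold\n");
		return 0;
	}

A positive return value is the number of sectors readable from @this_sector, which is what choose_first_rdev(), choose_bb_rdev() and choose_slow_rdev() feed into *max_sectors when picking a disk.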
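
Similarly, the idle-disk hand-off condition in should_choose_next() can be illustrated in isolation. The sketch below, with a hypothetical struct fake_mirror standing in for struct raid1_info (again not part of the patch), shows that a switch away from the current mirror is only considered on a non-rotational member once the buffered sequential run (next_seq_sect - seq_start) has reached the device's optimal IO size.

	/*
	 * Illustrative sketch of the sequential-read hand-off heuristic
	 * encoded by should_choose_next(): on an SSD member, once the
	 * sequential stream has grown past bdev_io_opt(), it is worth
	 * looking for an idle mirror instead of sticking to this one.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned long long sector_t;
	#define MaxSector (~(sector_t)0)

	struct fake_mirror {
		bool nonrot;		/* Nonrot flag cached at add_conf time */
		int opt_iosize;		/* bdev_io_opt() >> 9, in sectors */
		sector_t seq_start;	/* where the current sequential run began */
		sector_t next_seq_sect;	/* sector the next sequential read would hit */
	};

	static bool should_choose_next(const struct fake_mirror *m)
	{
		if (!m->nonrot)
			return false;	/* rotational disks keep the head where it is */

		return m->opt_iosize > 0 && m->seq_start != MaxSector &&
		       m->next_seq_sect > (sector_t)m->opt_iosize &&
		       m->next_seq_sect - m->opt_iosize >= m->seq_start;
	}

	int main(void)
	{
		/* 512-sector optimal IO size; stream started at sector 1000. */
		struct fake_mirror m = { true, 512, 1000, 1000 + 256 };

		printf("short run, stay on this disk: %d\n", should_choose_next(&m)); /* 0 */

		m.next_seq_sect = 1000 + 1024;	/* run now exceeds opt_iosize */
		printf("long run, consider an idle disk: %d\n", should_choose_next(&m)); /* 1 */
		return 0;
	}

In choose_best_rdev() a true result does not force a switch: the current disk is remembered as ctl.sequential_disk and is still used if no other member turns out to be idle (ctl.min_pending != 0).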