raid5: add a per-stripe lock
Add a per-stripe lock to protect stripe-specific data. The purpose is to reduce lock contention on conf->device_lock.

The stripe's ->toread and ->towrite lists are protected by the per-stripe lock. Access to the bio lists of a stripe is always serialized by this lock, so adding a bio to the lists (add_stripe_bio()) and removing bios from the lists (as in ops_run_biofill()) do not race.

If the bios in the ->read, ->written ... lists are not shared by multiple stripes, we don't need any lock to protect ->read and ->written, because STRIPE_ACTIVE will protect them. If the bios are shared, there are two protections:
1. bi_phys_segments acts as a reference count.
2. List traversal uses r5_next_bio, which ensures the traversal never accesses a bio that does not belong to the stripe.

Let's have an example:
|  stripe1 |  stripe2    |  stripe3  |
...bio1......|bio2|bio3|....bio4.....

stripe2 has 4 bios. When it's finished, it will decrement bi_phys_segments for all bios, but only call end_bio for bio2 and bio3. bio1->bi_next still points to bio2, but this doesn't matter. When stripe1 is finished, it will not touch bio2 because of the r5_next_bio check. Later, stripe1 will call end_bio for bio1 and stripe3 will call end_bio for bio4.

Before add_stripe_bio() adds a bio to a stripe, we have already incremented the bio's bi_phys_segments, so there is no need to worry about other stripes releasing the bio.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
This commit is contained in:
		| @@ -762,14 +762,12 @@ static void ops_complete_biofill(void *stripe_head_ref) | ||||
| { | ||||
| 	struct stripe_head *sh = stripe_head_ref; | ||||
| 	struct bio *return_bi = NULL; | ||||
| 	struct r5conf *conf = sh->raid_conf; | ||||
| 	int i; | ||||
|  | ||||
| 	pr_debug("%s: stripe %llu\n", __func__, | ||||
| 		(unsigned long long)sh->sector); | ||||
|  | ||||
| 	/* clear completed biofills */ | ||||
| 	spin_lock_irq(&conf->device_lock); | ||||
| 	for (i = sh->disks; i--; ) { | ||||
| 		struct r5dev *dev = &sh->dev[i]; | ||||
|  | ||||
| @@ -795,7 +793,6 @@ static void ops_complete_biofill(void *stripe_head_ref) | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state); | ||||
|  | ||||
| 	return_io(return_bi); | ||||
| @@ -807,7 +804,6 @@ static void ops_complete_biofill(void *stripe_head_ref) | ||||
| static void ops_run_biofill(struct stripe_head *sh) | ||||
| { | ||||
| 	struct dma_async_tx_descriptor *tx = NULL; | ||||
| 	struct r5conf *conf = sh->raid_conf; | ||||
| 	struct async_submit_ctl submit; | ||||
| 	int i; | ||||
|  | ||||
| @@ -818,10 +814,10 @@ static void ops_run_biofill(struct stripe_head *sh) | ||||
| 		struct r5dev *dev = &sh->dev[i]; | ||||
| 		if (test_bit(R5_Wantfill, &dev->flags)) { | ||||
| 			struct bio *rbi; | ||||
| 			spin_lock_irq(&conf->device_lock); | ||||
| 			spin_lock_irq(&sh->stripe_lock); | ||||
| 			dev->read = rbi = dev->toread; | ||||
| 			dev->toread = NULL; | ||||
| 			spin_unlock_irq(&conf->device_lock); | ||||
| 			spin_unlock_irq(&sh->stripe_lock); | ||||
| 			while (rbi && rbi->bi_sector < | ||||
| 				dev->sector + STRIPE_SECTORS) { | ||||
| 				tx = async_copy_data(0, rbi, dev->page, | ||||
| @@ -1157,12 +1153,12 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) | ||||
| 		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { | ||||
| 			struct bio *wbi; | ||||
|  | ||||
| 			spin_lock_irq(&sh->raid_conf->device_lock); | ||||
| 			spin_lock_irq(&sh->stripe_lock); | ||||
| 			chosen = dev->towrite; | ||||
| 			dev->towrite = NULL; | ||||
| 			BUG_ON(dev->written); | ||||
| 			wbi = dev->written = chosen; | ||||
| 			spin_unlock_irq(&sh->raid_conf->device_lock); | ||||
| 			spin_unlock_irq(&sh->stripe_lock); | ||||
|  | ||||
| 			while (wbi && wbi->bi_sector < | ||||
| 				dev->sector + STRIPE_SECTORS) { | ||||
| @@ -1467,6 +1463,8 @@ static int grow_one_stripe(struct r5conf *conf) | ||||
| 	init_waitqueue_head(&sh->ops.wait_for_ops); | ||||
| 	#endif | ||||
|  | ||||
| 	spin_lock_init(&sh->stripe_lock); | ||||
|  | ||||
| 	if (grow_buffers(sh)) { | ||||
| 		shrink_buffers(sh); | ||||
| 		kmem_cache_free(conf->slab_cache, sh); | ||||
| @@ -2353,8 +2351,15 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | ||||
| 		(unsigned long long)bi->bi_sector, | ||||
| 		(unsigned long long)sh->sector); | ||||
|  | ||||
|  | ||||
| 	spin_lock_irq(&conf->device_lock); | ||||
| 	/* | ||||
| 	 * If a bio is shared by several stripes, the bio's bi_phys_segments | ||||
| 	 * acts as a reference count to avoid races. The reference count should | ||||
| 	 * already be increased before this function is called (for example, in | ||||
| 	 * make_request()), so other stripes sharing this bio will not free the | ||||
| 	 * bio. If a bio is owned by one stripe, the stripe lock will protect | ||||
| 	 * it. | ||||
| 	 */ | ||||
| 	spin_lock_irq(&sh->stripe_lock); | ||||
| 	if (forwrite) { | ||||
| 		bip = &sh->dev[dd_idx].towrite; | ||||
| 		if (*bip == NULL) | ||||
| @@ -2388,7 +2393,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | ||||
| 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) | ||||
| 			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); | ||||
| 	} | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 	spin_unlock_irq(&sh->stripe_lock); | ||||
|  | ||||
| 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", | ||||
| 		(unsigned long long)(*bip)->bi_sector, | ||||
| @@ -2404,7 +2409,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in | ||||
|  | ||||
|  overlap: | ||||
| 	set_bit(R5_Overlap, &sh->dev[dd_idx].flags); | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 	spin_unlock_irq(&sh->stripe_lock); | ||||
| 	return 0; | ||||
| } | ||||
|  | ||||
| @@ -2454,11 +2459,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, | ||||
| 				rdev_dec_pending(rdev, conf->mddev); | ||||
| 			} | ||||
| 		} | ||||
| 		spin_lock_irq(&conf->device_lock); | ||||
| 		spin_lock_irq(&sh->stripe_lock); | ||||
| 		/* fail all writes first */ | ||||
| 		bi = sh->dev[i].towrite; | ||||
| 		sh->dev[i].towrite = NULL; | ||||
| 		spin_unlock_irq(&conf->device_lock); | ||||
| 		spin_unlock_irq(&sh->stripe_lock); | ||||
| 		if (bi) { | ||||
| 			s->to_write--; | ||||
| 			bitmap_end = 1; | ||||
| @@ -3192,7 +3197,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | ||||
|  | ||||
| 	/* Now to look around and see what can be done */ | ||||
| 	rcu_read_lock(); | ||||
| 	spin_lock_irq(&conf->device_lock); | ||||
| 	for (i=disks; i--; ) { | ||||
| 		struct md_rdev *rdev; | ||||
| 		sector_t first_bad; | ||||
| @@ -3338,7 +3342,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) | ||||
| 				do_recovery = 1; | ||||
| 		} | ||||
| 	} | ||||
| 	spin_unlock_irq(&conf->device_lock); | ||||
| 	if (test_bit(STRIPE_SYNCING, &sh->state)) { | ||||
| 		/* If there is a failed device being replaced, | ||||
| 		 *     we must be recovering. | ||||
|   | ||||
| @@ -210,6 +210,7 @@ struct stripe_head { | ||||
| 	int			disks;		/* disks in stripe */ | ||||
| 	enum check_states	check_state; | ||||
| 	enum reconstruct_states reconstruct_state; | ||||
| 	spinlock_t		stripe_lock; | ||||
| 	/** | ||||
| 	 * struct stripe_operations | ||||
| 	 * @target - STRIPE_OP_COMPUTE_BLK target | ||||
|   | ||||
		Reference in New Issue
	
	Block a user