blk-mq: improve layout of blk_mq_hw_ctx

Various cache line optimizations: - Move delay_work towards the end. It's huge, and we don't use it a lot (only SCSI). - Move the atomic state into the same cacheline as the dispatch list and lock. - Rearrange a few members to pack it better. - Shrink the max-order for dispatch accounting from 10 to 7. This means that ->dispatched[] and ->run now take up their own cacheline. This shrinks struct blk_mq_hw_ctx down to 8 cachelines. Signed-off-by: Jens Axboe <axboe@fb.com>
2016-08-25 08:00:28 -06:00 · 2016-08-25 08:00:28 -06:00 · 8d354f133e
commit 8d354f133e
parent 27489a3c82
1 changed files with 5 additions and 4 deletions
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,11 +22,10 @@ struct blk_mq_hw_ctx {
 	struct {
 		spinlock_t		lock;
 		struct list_head	dispatch;
+		unsigned long		state;		/* BLK_MQ_S_* flags */
 	} ____cacheline_aligned_in_smp;

-	unsigned long		state;		/* BLK_MQ_S_* flags */
 	struct work_struct	run_work;
-	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
 	int			next_cpu;
 	int			next_cpu_batch;
@@ -40,8 +39,8 @@ struct blk_mq_hw_ctx {

 	struct blk_mq_ctxmap	ctx_map;

-	unsigned int		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
+	unsigned int		nr_ctx;

 	atomic_t		wait_index;

@@ -49,7 +48,7 @@ struct blk_mq_hw_ctx {

 	unsigned long		queued;
 	unsigned long		run;
-#define BLK_MQ_MAX_DISPATCH_ORDER	10
+#define BLK_MQ_MAX_DISPATCH_ORDER	7
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

 	unsigned int		numa_node;
@@ -57,6 +56,8 @@ struct blk_mq_hw_ctx {

 	atomic_t		nr_active;

+	struct delayed_work	delay_work;
+
 	struct blk_mq_cpu_notifier	cpu_notifier;
 	struct kobject		kobj;