From b82592199032bf7c778f861b936287e37ebc9f62 Mon Sep 17 00:00:00 2001
From: Long Li <longli@microsoft.com>
Date: Fri, 2 Nov 2018 18:02:48 +0000
Subject: [PATCH 001/351] genirq/affinity: Spread IRQs to all available NUMA
 nodes

If the number of NUMA nodes exceeds the number of MSI/MSI-X interrupts
which are allocated for a device, the interrupt affinity spreading code
fails to spread them across all nodes.

The reason is, that the spreading code starts from node 0 and continues up
to the number of interrupts requested for allocation. This leaves the nodes
past the last interrupt unused.

This results in interrupt concentration on the first nodes which violates
the assumption of the block layer that all nodes are covered evenly. As a
consequence the NUMA nodes above the number of interrupts are all assigned
to hardware queue 0 and therefore NUMA node 0, which results in bad
performance and has CPU hotplug implications, because queue 0 gets shut
down when the last CPU of node 0 is offlined.

Go over all NUMA nodes and assign them round-robin to all requested
interrupts to solve this.

[ tglx: Massaged changelog ]

Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Cc: Michael Kelley <mikelley@microsoft.com>
Link: https://lkml.kernel.org/r/20181102180248.13583-1-longli@linuxonhyperv.com
---
 kernel/irq/affinity.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index f4f29b9d90ee..e12cdf637c71 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -117,12 +117,11 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_copy(masks + curvec, node_to_cpumask[n]);
-			if (++done == numvecs)
-				break;
+			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = affd->pre_vectors;
 		}
+		done = numvecs;
 		goto out;
 	}
 

From 5c903e108d0b005cf59904ca3520934fca4b9439 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 2 Nov 2018 22:59:49 +0800
Subject: [PATCH 002/351] genirq/affinity: Move two stage affinity spreading
 into a helper function

No functional change. Prepares for supporting allocating and affinitizing
interrupt sets.

[ tglx: Minor changelog tweaks ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Hannes Reinecke <hare@suse.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Link: https://lkml.kernel.org/r/20181102145951.31979-3-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 92 ++++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 36 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12cdf637c71..2f9812b6035e 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -94,7 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 	return nodes;
 }
 
-static int irq_build_affinity_masks(const struct irq_affinity *affd,
+static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				    int startvec, int numvecs,
 				    cpumask_var_t *node_to_cpumask,
 				    const struct cpumask *cpu_mask,
@@ -165,6 +165,58 @@ out:
 	return done;
 }
 
+/*
+ * build affinity in two stages:
+ *	1) spread present CPU on these vectors
+ *	2) spread other possible CPUs on these vectors
+ */
+static int irq_build_affinity_masks(const struct irq_affinity *affd,
+				    int startvec, int numvecs,
+				    cpumask_var_t *node_to_cpumask,
+				    struct cpumask *masks)
+{
+	int curvec = startvec, usedvecs = -1;
+	cpumask_var_t nmsk, npresmsk;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+			return usedvecs;
+
+	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+			goto fail;
+
+	/* Stabilize the cpumasks */
+	get_online_cpus();
+	build_node_to_cpumask(node_to_cpumask);
+
+	/* Spread on present CPUs starting from affd->pre_vectors */
+	usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs,
+					    node_to_cpumask, cpu_present_mask,
+					    nmsk, masks);
+
+	/*
+	 * Spread on non present CPUs starting from the next vector to be
+	 * handled. If the spreading of present CPUs already exhausted the
+	 * vector space, assign the non present CPUs to the already spread
+	 * out vectors.
+	 */
+	if (usedvecs >= numvecs)
+		curvec = affd->pre_vectors;
+	else
+		curvec = affd->pre_vectors + usedvecs;
+	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+	usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs,
+					     node_to_cpumask, npresmsk,
+					     nmsk, masks);
+	put_online_cpus();
+
+	free_cpumask_var(npresmsk);
+
+ fail:
+	free_cpumask_var(nmsk);
+
+	return usedvecs;
+}
+
 /**
  * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
  * @nvecs:	The total number of vectors
@@ -177,7 +229,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
 	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	int curvec, usedvecs;
-	cpumask_var_t nmsk, npresmsk, *node_to_cpumask;
+	cpumask_var_t *node_to_cpumask;
 	struct cpumask *masks = NULL;
 
 	/*
@@ -187,15 +239,9 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	if (nvecs == affd->pre_vectors + affd->post_vectors)
 		return NULL;
 
-	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-		return NULL;
-
-	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
-		goto outcpumsk;
-
 	node_to_cpumask = alloc_node_to_cpumask();
 	if (!node_to_cpumask)
-		goto outnpresmsk;
+		return NULL;
 
 	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
@@ -205,30 +251,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
 
-	/* Stabilize the cpumasks */
-	get_online_cpus();
-	build_node_to_cpumask(node_to_cpumask);
-
-	/* Spread on present CPUs starting from affd->pre_vectors */
 	usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
-					    node_to_cpumask, cpu_present_mask,
-					    nmsk, masks);
-
-	/*
-	 * Spread on non present CPUs starting from the next vector to be
-	 * handled. If the spreading of present CPUs already exhausted the
-	 * vector space, assign the non present CPUs to the already spread
-	 * out vectors.
-	 */
-	if (usedvecs >= affvecs)
-		curvec = affd->pre_vectors;
-	else
-		curvec = affd->pre_vectors + usedvecs;
-	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	usedvecs += irq_build_affinity_masks(affd, curvec, affvecs,
-					     node_to_cpumask, npresmsk,
-					     nmsk, masks);
-	put_online_cpus();
+					    node_to_cpumask, masks);
 
 	/* Fill out vectors at the end that don't need affinity */
 	if (usedvecs >= affvecs)
@@ -240,10 +264,6 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
-outnpresmsk:
-	free_cpumask_var(npresmsk);
-outcpumsk:
-	free_cpumask_var(nmsk);
 	return masks;
 }
 

From 060746d9e394084b7401e7532f2de528ecbfb521 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 2 Nov 2018 22:59:50 +0800
Subject: [PATCH 003/351] genirq/affinity: Pass first vector to
 __irq_build_affinity_masks()

No functional change.

Prepares for support of allocating and affinitizing sets of interrupts, in
which each set of interrupts needs a full two stage spreading. The first
vector argument is necessary for this so the affinitizing starts from the
first vector of each set.

[ tglx: Minor changelog tweaks ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Hannes Reinecke <hare@suse.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sagi Grimberg <sagi@grimberg.me>
Link: https://lkml.kernel.org/r/20181102145951.31979-4-ming.lei@redhat.com
---
 kernel/irq/affinity.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 2f9812b6035e..e028b773e38a 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -95,14 +95,14 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 }
 
 static int __irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs,
+				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
 				    const struct cpumask *cpu_mask,
 				    struct cpumask *nmsk,
 				    struct cpumask *masks)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
-	int last_affv = affd->pre_vectors + numvecs;
+	int last_affv = firstvec + numvecs;
 	int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
 
@@ -119,7 +119,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 		for_each_node_mask(n, nodemsk) {
 			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
 			if (++curvec == last_affv)
-				curvec = affd->pre_vectors;
+				curvec = firstvec;
 		}
 		done = numvecs;
 		goto out;
@@ -129,7 +129,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 		int ncpus, v, vecs_to_assign, vecs_per_node;
 
 		/* Spread the vectors per node */
-		vecs_per_node = (numvecs - (curvec - affd->pre_vectors)) / nodes;
+		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
 
 		/* Get the cpus on this node which are in the mask */
 		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
@@ -157,7 +157,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 		if (done >= numvecs)
 			break;
 		if (curvec >= last_affv)
-			curvec = affd->pre_vectors;
+			curvec = firstvec;
 		--nodes;
 	}
 
@@ -190,8 +190,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
 	usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs,
-					    node_to_cpumask, cpu_present_mask,
-					    nmsk, masks);
+					      affd->pre_vectors,
+					      node_to_cpumask,
+					      cpu_present_mask, nmsk, masks);
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -205,8 +206,9 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 		curvec = affd->pre_vectors + usedvecs;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
 	usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs,
-					     node_to_cpumask, npresmsk,
-					     nmsk, masks);
+					       affd->pre_vectors,
+					       node_to_cpumask, npresmsk,
+					       nmsk, masks);
 	put_online_cpus();
 
 	free_cpumask_var(npresmsk);

From 6da4b3ab9a6e9b1b5f90322ab3fa3a7dd18edb19 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 2 Nov 2018 22:59:51 +0800
Subject: [PATCH 004/351] genirq/affinity: Add support for allocating interrupt
 sets

A driver may have a need to allocate multiple sets of MSI/MSI-X interrupts,
and have them appropriately affinitized.

Add support for defining a number of sets in the irq_affinity structure, of
varying sizes, and get each set affinitized correctly across the machine.

[ tglx: Minor changelog tweaks ]

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-block@vger.kernel.org
Link: https://lkml.kernel.org/r/20181102145951.31979-5-ming.lei@redhat.com
---
 drivers/pci/msi.c         | 14 +++++++
 include/linux/interrupt.h |  4 ++
 kernel/irq/affinity.c     | 77 +++++++++++++++++++++++++++------------
 3 files changed, 72 insertions(+), 23 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index af24ed50a245..265ed3e4c920 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1036,6 +1036,13 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (maxvec < minvec)
 		return -ERANGE;
 
+	/*
+	 * If the caller is passing in sets, we can't support a range of
+	 * vectors. The caller needs to handle that.
+	 */
+	if (affd && affd->nr_sets && minvec != maxvec)
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(dev->msi_enabled))
 		return -EINVAL;
 
@@ -1087,6 +1094,13 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 	if (maxvec < minvec)
 		return -ERANGE;
 
+	/*
+	 * If the caller is passing in sets, we can't support a range of
+	 * supported vectors. The caller needs to handle that.
+	 */
+	if (affd && affd->nr_sets && minvec != maxvec)
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(dev->msix_enabled))
 		return -EINVAL;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 1d6711c28271..ca397ff40836 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -247,10 +247,14 @@ struct irq_affinity_notify {
  *			the MSI(-X) vector space
  * @post_vectors:	Don't apply affinity to @post_vectors at end of
  *			the MSI(-X) vector space
+ * @nr_sets:		Length of passed in *sets array
+ * @sets:		Number of affinitized sets
  */
 struct irq_affinity {
 	int	pre_vectors;
 	int	post_vectors;
+	int	nr_sets;
+	int	*sets;
 };
 
 #if defined(CONFIG_SMP)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e028b773e38a..08c904eb7279 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -171,28 +171,29 @@ out:
  *	2) spread other possible CPUs on these vectors
  */
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs,
+				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
 				    struct cpumask *masks)
 {
-	int curvec = startvec, usedvecs = -1;
+	int curvec = startvec, nr_present, nr_others;
+	int ret = -ENOMEM;
 	cpumask_var_t nmsk, npresmsk;
 
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
-			return usedvecs;
+			return ret;
 
 	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
 			goto fail;
 
+	ret = 0;
 	/* Stabilize the cpumasks */
 	get_online_cpus();
 	build_node_to_cpumask(node_to_cpumask);
 
 	/* Spread on present CPUs starting from affd->pre_vectors */
-	usedvecs = __irq_build_affinity_masks(affd, curvec, numvecs,
-					      affd->pre_vectors,
-					      node_to_cpumask,
-					      cpu_present_mask, nmsk, masks);
+	nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
+						firstvec, node_to_cpumask,
+						cpu_present_mask, nmsk, masks);
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -200,23 +201,24 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	 * vector space, assign the non present CPUs to the already spread
 	 * out vectors.
 	 */
-	if (usedvecs >= numvecs)
-		curvec = affd->pre_vectors;
+	if (nr_present >= numvecs)
+		curvec = firstvec;
 	else
-		curvec = affd->pre_vectors + usedvecs;
+		curvec = firstvec + nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
-	usedvecs += __irq_build_affinity_masks(affd, curvec, numvecs,
-					       affd->pre_vectors,
-					       node_to_cpumask, npresmsk,
-					       nmsk, masks);
+	nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
+					       firstvec, node_to_cpumask,
+					       npresmsk, nmsk, masks);
 	put_online_cpus();
 
+	if (nr_present < numvecs)
+			WARN_ON(nr_present + nr_others < numvecs);
+
 	free_cpumask_var(npresmsk);
 
  fail:
 	free_cpumask_var(nmsk);
-
-	return usedvecs;
+	return ret;
 }
 
 /**
@@ -233,6 +235,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	int curvec, usedvecs;
 	cpumask_var_t *node_to_cpumask;
 	struct cpumask *masks = NULL;
+	int i, nr_sets;
 
 	/*
 	 * If there aren't any vectors left after applying the pre/post
@@ -253,8 +256,28 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
 
-	usedvecs = irq_build_affinity_masks(affd, curvec, affvecs,
-					    node_to_cpumask, masks);
+	/*
+	 * Spread on present CPUs starting from affd->pre_vectors. If we
+	 * have multiple sets, build each sets affinity mask separately.
+	 */
+	nr_sets = affd->nr_sets;
+	if (!nr_sets)
+		nr_sets = 1;
+
+	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
+		int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		int ret;
+
+		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
+						curvec, node_to_cpumask, masks);
+		if (ret) {
+				kfree(masks);
+				masks = NULL;
+				goto outnodemsk;
+		}
+		curvec += this_vecs;
+		usedvecs += this_vecs;
+	}
 
 	/* Fill out vectors at the end that don't need affinity */
 	if (usedvecs >= affvecs)
@@ -279,13 +302,21 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
 {
 	int resv = affd->pre_vectors + affd->post_vectors;
 	int vecs = maxvec - resv;
-	int ret;
+	int set_vecs;
 
 	if (resv > minvec)
 		return 0;
 
-	get_online_cpus();
-	ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
-	put_online_cpus();
-	return ret;
+	if (affd->nr_sets) {
+		int i;
+
+		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
+			set_vecs += affd->sets[i];
+	} else {
+		get_online_cpus();
+		set_vecs = cpumask_weight(cpu_possible_mask);
+		put_online_cpus();
+	}
+
+	return resv + min(set_vecs, vecs);
 }

From ea2c18e1044e9ed8f780c965c50432060ab0e355 Mon Sep 17 00:00:00 2001
From: Masato Suzuki <masato.suzuki@wdc.com>
Date: Tue, 30 Oct 2018 16:14:05 +0900
Subject: [PATCH 005/351] null_blk: Add conventional zone configuration for
 zoned support

Allow the creation of conventional zones by adding the zone_nr_conv
configuration attribute. This new attribute is used only for zoned devices
and indicates the number of conventional zones to create. The default value
is 0. Since host-managed zoned block devices must always have at least one
sequential zone, if the value of zone_nr_conv is larger than or equal to
the total number of zones of the device nr_zones, zone_nr_conv is
automatically changed to nr_zones - 1.

Reviewed-by: Matias Bjorling <matias.bjorling@wdc.com>
Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Masato Suzuki <masato.suzuki@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk.h       |  1 +
 drivers/block/null_blk_main.c  |  7 +++++++
 drivers/block/null_blk_zoned.c | 27 ++++++++++++++++++++++++++-
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index 7685df43f1ef..b3df2793e7cd 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -49,6 +49,7 @@ struct nullb_device {
 	unsigned long completion_nsec; /* time in ns to complete a request */
 	unsigned long cache_size; /* disk cache size in MB */
 	unsigned long zone_size; /* zone size in MB if device is zoned */
+	unsigned int zone_nr_conv; /* number of conventional zones */
 	unsigned int submit_queues; /* number of submission queues */
 	unsigned int home_node; /* home node for the device */
 	unsigned int queue_mode; /* block interface */
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 09339203dfba..9c045bee0985 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -188,6 +188,10 @@ static unsigned long g_zone_size = 256;
 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
 MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
 
+static unsigned int g_zone_nr_conv;
+module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
+MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
+
 static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
@@ -293,6 +297,7 @@ NULLB_DEVICE_ATTR(mbps, uint);
 NULLB_DEVICE_ATTR(cache_size, ulong);
 NULLB_DEVICE_ATTR(zoned, bool);
 NULLB_DEVICE_ATTR(zone_size, ulong);
+NULLB_DEVICE_ATTR(zone_nr_conv, uint);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -407,6 +412,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
 	&nullb_device_attr_badblocks,
 	&nullb_device_attr_zoned,
 	&nullb_device_attr_zone_size,
+	&nullb_device_attr_zone_nr_conv,
 	NULL,
 };
 
@@ -520,6 +526,7 @@ static struct nullb_device *null_alloc_dev(void)
 	dev->use_per_node_hctx = g_use_per_node_hctx;
 	dev->zoned = g_zoned;
 	dev->zone_size = g_zone_size;
+	dev->zone_nr_conv = g_zone_nr_conv;
 	return dev;
 }
 
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
index c0b0e4a3fa8f..5d1c261a2cfd 100644
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -29,7 +29,25 @@ int null_zone_init(struct nullb_device *dev)
 	if (!dev->zones)
 		return -ENOMEM;
 
-	for (i = 0; i < dev->nr_zones; i++) {
+	if (dev->zone_nr_conv >= dev->nr_zones) {
+		dev->zone_nr_conv = dev->nr_zones - 1;
+		pr_info("null_blk: changed the number of conventional zones to %u",
+			dev->zone_nr_conv);
+	}
+
+	for (i = 0; i <  dev->zone_nr_conv; i++) {
+		struct blk_zone *zone = &dev->zones[i];
+
+		zone->start = sector;
+		zone->len = dev->zone_size_sects;
+		zone->wp = zone->start + zone->len;
+		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
+		zone->cond = BLK_ZONE_COND_NOT_WP;
+
+		sector += dev->zone_size_sects;
+	}
+
+	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
 		struct blk_zone *zone = &dev->zones[i];
 
 		zone->start = zone->wp = sector;
@@ -98,6 +116,8 @@ void null_zone_write(struct nullb_cmd *cmd, sector_t sector,
 		if (zone->wp == zone->start + zone->len)
 			zone->cond = BLK_ZONE_COND_FULL;
 		break;
+	case BLK_ZONE_COND_NOT_WP:
+		break;
 	default:
 		/* Invalid zone condition */
 		cmd->error = BLK_STS_IOERR;
@@ -111,6 +131,11 @@ void null_zone_reset(struct nullb_cmd *cmd, sector_t sector)
 	unsigned int zno = null_zone_no(dev, sector);
 	struct blk_zone *zone = &dev->zones[zno];
 
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		cmd->error = BLK_STS_IOERR;
+		return;
+	}
+
 	zone->cond = BLK_ZONE_COND_EMPTY;
 	zone->wp = zone->start;
 }

From fa182a1fa97dff56cda742f22d17d666420cd27f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Oct 2018 08:54:23 -0600
Subject: [PATCH 006/351] sunvdc: convert to blk-mq

Convert from the old request_fn style driver to blk-mq.

Cc: David Miller <davem@davemloft.net>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/sunvdc.c | 149 +++++++++++++++++++++++++++--------------
 1 file changed, 98 insertions(+), 51 deletions(-)

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index b54fa6726303..95cb4ea8e402 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -6,7 +6,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/hdreg.h>
 #include <linux/genhd.h>
 #include <linux/cdrom.h>
@@ -66,9 +66,10 @@ struct vdc_port {
 
 	u64			max_xfer_size;
 	u32			vdisk_block_size;
+	u32			drain;
 
 	u64			ldc_timeout;
-	struct timer_list	ldc_reset_timer;
+	struct delayed_work	ldc_reset_timer_work;
 	struct work_struct	ldc_reset_work;
 
 	/* The server fills these in for us in the disk attribute
@@ -80,12 +81,14 @@ struct vdc_port {
 	u8			vdisk_mtype;
 	u32			vdisk_phys_blksz;
 
+	struct blk_mq_tag_set	tag_set;
+
 	char			disk_name[32];
 };
 
 static void vdc_ldc_reset(struct vdc_port *port);
 static void vdc_ldc_reset_work(struct work_struct *work);
-static void vdc_ldc_reset_timer(struct timer_list *t);
+static void vdc_ldc_reset_timer_work(struct work_struct *work);
 
 static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
 {
@@ -175,11 +178,8 @@ static void vdc_blk_queue_start(struct vdc_port *port)
 	 * handshake completes, so check for initial handshake before we've
 	 * allocated a disk.
 	 */
-	if (port->disk && blk_queue_stopped(port->disk->queue) &&
-	    vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) {
-		blk_start_queue(port->disk->queue);
-	}
-
+	if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50)
+		blk_mq_start_hw_queues(port->disk->queue);
 }
 
 static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
@@ -197,7 +197,7 @@ static void vdc_handshake_complete(struct vio_driver_state *vio)
 {
 	struct vdc_port *port = to_vdc_port(vio);
 
-	del_timer(&port->ldc_reset_timer);
+	cancel_delayed_work(&port->ldc_reset_timer_work);
 	vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
 	vdc_blk_queue_start(port);
 }
@@ -320,7 +320,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
 
 	rqe->req = NULL;
 
-	__blk_end_request(req, (desc->status ? BLK_STS_IOERR : 0), desc->size);
+	blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0);
 
 	vdc_blk_queue_start(port);
 }
@@ -525,29 +525,40 @@ static int __send_request(struct request *req)
 	return err;
 }
 
-static void do_vdc_request(struct request_queue *rq)
+static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
 {
-	struct request *req;
+	struct vdc_port *port = hctx->queue->queuedata;
+	struct vio_dring_state *dr;
+	unsigned long flags;
 
-	while ((req = blk_peek_request(rq)) != NULL) {
-		struct vdc_port *port;
-		struct vio_dring_state *dr;
+	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
 
-		port = req->rq_disk->private_data;
-		dr = &port->vio.drings[VIO_DRIVER_TX_RING];
-		if (unlikely(vdc_tx_dring_avail(dr) < 1))
-			goto wait;
+	blk_mq_start_request(bd->rq);
 
-		blk_start_request(req);
+	spin_lock_irqsave(&port->vio.lock, flags);
 
-		if (__send_request(req) < 0) {
-			blk_requeue_request(rq, req);
-wait:
-			/* Avoid pointless unplugs. */
-			blk_stop_queue(rq);
-			break;
-		}
+	/*
+	 * Doing drain, just end the request in error
+	 */
+	if (unlikely(port->drain)) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		return BLK_STS_IOERR;
 	}
+
+	if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		blk_mq_stop_hw_queue(hctx);
+		return BLK_STS_DEV_RESOURCE;
+	}
+
+	if (__send_request(bd->rq) < 0) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		return BLK_STS_IOERR;
+	}
+
+	spin_unlock_irqrestore(&port->vio.lock, flags);
+	return BLK_STS_OK;
 }
 
 static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
@@ -759,6 +770,32 @@ static void vdc_port_down(struct vdc_port *port)
 	vio_ldc_free(&port->vio);
 }
 
+static const struct blk_mq_ops vdc_mq_ops = {
+	.queue_rq	= vdc_queue_rq,
+};
+
+static void cleanup_queue(struct request_queue *q)
+{
+	struct vdc_port *port = q->queuedata;
+
+	blk_cleanup_queue(q);
+	blk_mq_free_tag_set(&port->tag_set);
+}
+
+static struct request_queue *init_queue(struct vdc_port *port)
+{
+	struct request_queue *q;
+	int ret;
+
+	q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE,
+					BLK_MQ_F_SHOULD_MERGE);
+	if (IS_ERR(q))
+		return q;
+
+	q->queuedata = port;
+	return q;
+}
+
 static int probe_disk(struct vdc_port *port)
 {
 	struct request_queue *q;
@@ -796,17 +833,17 @@ static int probe_disk(struct vdc_port *port)
 				    (u64)geom.num_sec);
 	}
 
-	q = blk_init_queue(do_vdc_request, &port->vio.lock);
-	if (!q) {
+	q = init_queue(port);
+	if (IS_ERR(q)) {
 		printk(KERN_ERR PFX "%s: Could not allocate queue.\n",
 		       port->vio.name);
-		return -ENOMEM;
+		return PTR_ERR(q);
 	}
 	g = alloc_disk(1 << PARTITION_SHIFT);
 	if (!g) {
 		printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
 		       port->vio.name);
-		blk_cleanup_queue(q);
+		cleanup_queue(q);
 		return -ENOMEM;
 	}
 
@@ -981,7 +1018,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 	 */
 	ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
 	port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
-	timer_setup(&port->ldc_reset_timer, vdc_ldc_reset_timer, 0);
+	INIT_DELAYED_WORK(&port->ldc_reset_timer_work, vdc_ldc_reset_timer_work);
 	INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
 
 	err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
@@ -1034,18 +1071,14 @@ static int vdc_port_remove(struct vio_dev *vdev)
 	struct vdc_port *port = dev_get_drvdata(&vdev->dev);
 
 	if (port) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&port->vio.lock, flags);
-		blk_stop_queue(port->disk->queue);
-		spin_unlock_irqrestore(&port->vio.lock, flags);
+		blk_mq_stop_hw_queues(port->disk->queue);
 
 		flush_work(&port->ldc_reset_work);
-		del_timer_sync(&port->ldc_reset_timer);
+		cancel_delayed_work_sync(&port->ldc_reset_timer_work);
 		del_timer_sync(&port->vio.timer);
 
 		del_gendisk(port->disk);
-		blk_cleanup_queue(port->disk->queue);
+		cleanup_queue(port->disk->queue);
 		put_disk(port->disk);
 		port->disk = NULL;
 
@@ -1080,32 +1113,46 @@ static void vdc_requeue_inflight(struct vdc_port *port)
 		}
 
 		rqe->req = NULL;
-		blk_requeue_request(port->disk->queue, req);
+		blk_mq_requeue_request(req, false);
 	}
 }
 
 static void vdc_queue_drain(struct vdc_port *port)
 {
-	struct request *req;
+	struct request_queue *q = port->disk->queue;
 
-	while ((req = blk_fetch_request(port->disk->queue)) != NULL)
-		__blk_end_request_all(req, BLK_STS_IOERR);
+	/*
+	 * Mark the queue as draining, then freeze/quiesce to ensure
+	 * that all existing requests are seen in ->queue_rq() and killed
+	 */
+	port->drain = 1;
+	spin_unlock_irq(&port->vio.lock);
+
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
+
+	spin_lock_irq(&port->vio.lock);
+	port->drain = 0;
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
 }
 
-static void vdc_ldc_reset_timer(struct timer_list *t)
+static void vdc_ldc_reset_timer_work(struct work_struct *work)
 {
-	struct vdc_port *port = from_timer(port, t, ldc_reset_timer);
-	struct vio_driver_state *vio = &port->vio;
-	unsigned long flags;
+	struct vdc_port *port;
+	struct vio_driver_state *vio;
 
-	spin_lock_irqsave(&vio->lock, flags);
+	port = container_of(work, struct vdc_port, ldc_reset_timer_work.work);
+	vio = &port->vio;
+
+	spin_lock_irq(&vio->lock);
 	if (!(port->vio.hs_state & VIO_HS_COMPLETE)) {
 		pr_warn(PFX "%s ldc down %llu seconds, draining queue\n",
 			port->disk_name, port->ldc_timeout);
 		vdc_queue_drain(port);
 		vdc_blk_queue_start(port);
 	}
-	spin_unlock_irqrestore(&vio->lock, flags);
+	spin_unlock_irq(&vio->lock);
 }
 
 static void vdc_ldc_reset_work(struct work_struct *work)
@@ -1129,7 +1176,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
 	assert_spin_locked(&port->vio.lock);
 
 	pr_warn(PFX "%s ldc link reset\n", port->disk_name);
-	blk_stop_queue(port->disk->queue);
+	blk_mq_stop_hw_queues(port->disk->queue);
 	vdc_requeue_inflight(port);
 	vdc_port_down(port);
 
@@ -1146,7 +1193,7 @@ static void vdc_ldc_reset(struct vdc_port *port)
 	}
 
 	if (port->ldc_timeout)
-		mod_timer(&port->ldc_reset_timer,
+		mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
 			  round_jiffies(jiffies + HZ * port->ldc_timeout));
 	mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
 	return;

From db1142a83b4caf73b6510d0efd6ef3ab7a508105 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Oct 2018 08:58:05 -0600
Subject: [PATCH 007/351] ms_block: convert to blk-mq

Straight forward conversion, room for optimization in how everything
is punted to a work queue. Also looks plenty racy all over the map,
with the state changes. I fixed a bunch of them up while doing the
conversion, but there are surely more.

Cc: Maxim Levitsky <maximlevitsky@gmail.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/memstick/core/ms_block.c | 110 +++++++++++++++++--------------
 drivers/memstick/core/ms_block.h |   1 +
 2 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 8a02f11076f9..ee0be66a9a03 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -15,7 +15,7 @@
 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
 
 #include <linux/module.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/memstick.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
@@ -1873,69 +1873,65 @@ static void msb_io_work(struct work_struct *work)
 	struct msb_data *msb = container_of(work, struct msb_data, io_work);
 	int page, error, len;
 	sector_t lba;
-	unsigned long flags;
 	struct scatterlist *sg = msb->prealloc_sg;
+	struct request *req;
 
 	dbg_verbose("IO: work started");
 
 	while (1) {
-		spin_lock_irqsave(&msb->q_lock, flags);
+		spin_lock_irq(&msb->q_lock);
 
 		if (msb->need_flush_cache) {
 			msb->need_flush_cache = false;
-			spin_unlock_irqrestore(&msb->q_lock, flags);
+			spin_unlock_irq(&msb->q_lock);
 			msb_cache_flush(msb);
 			continue;
 		}
 
-		if (!msb->req) {
-			msb->req = blk_fetch_request(msb->queue);
-			if (!msb->req) {
-				dbg_verbose("IO: no more requests exiting");
-				spin_unlock_irqrestore(&msb->q_lock, flags);
-				return;
-			}
+		req = msb->req;
+		if (!req) {
+			dbg_verbose("IO: no more requests exiting");
+			spin_unlock_irq(&msb->q_lock);
+			return;
 		}
 
-		spin_unlock_irqrestore(&msb->q_lock, flags);
-
-		/* If card was removed meanwhile */
-		if (!msb->req)
-			return;
+		spin_unlock_irq(&msb->q_lock);
 
 		/* process the request */
 		dbg_verbose("IO: processing new request");
-		blk_rq_map_sg(msb->queue, msb->req, sg);
+		blk_rq_map_sg(msb->queue, req, sg);
 
-		lba = blk_rq_pos(msb->req);
+		lba = blk_rq_pos(req);
 
 		sector_div(lba, msb->page_size / 512);
 		page = sector_div(lba, msb->pages_in_block);
 
 		if (rq_data_dir(msb->req) == READ)
 			error = msb_do_read_request(msb, lba, page, sg,
-				blk_rq_bytes(msb->req), &len);
+				blk_rq_bytes(req), &len);
 		else
 			error = msb_do_write_request(msb, lba, page, sg,
-				blk_rq_bytes(msb->req), &len);
+				blk_rq_bytes(req), &len);
 
-		spin_lock_irqsave(&msb->q_lock, flags);
-
-		if (len)
-			if (!__blk_end_request(msb->req, BLK_STS_OK, len))
-				msb->req = NULL;
+		if (len && !blk_update_request(req, BLK_STS_OK, len)) {
+			__blk_mq_end_request(req, BLK_STS_OK);
+			spin_lock_irq(&msb->q_lock);
+			msb->req = NULL;
+			spin_unlock_irq(&msb->q_lock);
+		}
 
 		if (error && msb->req) {
 			blk_status_t ret = errno_to_blk_status(error);
+
 			dbg_verbose("IO: ending one sector of the request with error");
-			if (!__blk_end_request(msb->req, ret, msb->page_size))
-				msb->req = NULL;
+			blk_mq_end_request(req, ret);
+			spin_lock_irq(&msb->q_lock);
+			msb->req = NULL;
+			spin_unlock_irq(&msb->q_lock);
 		}
 
 		if (msb->req)
 			dbg_verbose("IO: request still pending");
-
-		spin_unlock_irqrestore(&msb->q_lock, flags);
 	}
 }
 
@@ -2002,29 +1998,40 @@ static int msb_bd_getgeo(struct block_device *bdev,
 	return 0;
 }
 
-static void msb_submit_req(struct request_queue *q)
+static blk_status_t msb_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
 {
-	struct memstick_dev *card = q->queuedata;
+	struct memstick_dev *card = hctx->queue->queuedata;
 	struct msb_data *msb = memstick_get_drvdata(card);
-	struct request *req = NULL;
+	struct request *req = bd->rq;
 
 	dbg_verbose("Submit request");
 
+	spin_lock_irq(&msb->q_lock);
+
 	if (msb->card_dead) {
 		dbg("Refusing requests on removed card");
 
 		WARN_ON(!msb->io_queue_stopped);
 
-		while ((req = blk_fetch_request(q)) != NULL)
-			__blk_end_request_all(req, BLK_STS_IOERR);
-		return;
+		spin_unlock_irq(&msb->q_lock);
+		blk_mq_start_request(req);
+		return BLK_STS_IOERR;
 	}
 
-	if (msb->req)
-		return;
+	if (msb->req) {
+		spin_unlock_irq(&msb->q_lock);
+		return BLK_STS_DEV_RESOURCE;
+	}
+
+	blk_mq_start_request(req);
+	msb->req = req;
 
 	if (!msb->io_queue_stopped)
 		queue_work(msb->io_queue, &msb->io_work);
+
+	spin_unlock_irq(&msb->q_lock);
+	return BLK_STS_OK;
 }
 
 static int msb_check_card(struct memstick_dev *card)
@@ -2040,21 +2047,20 @@ static void msb_stop(struct memstick_dev *card)
 
 	dbg("Stopping all msblock IO");
 
+	blk_mq_stop_hw_queues(msb->queue);
 	spin_lock_irqsave(&msb->q_lock, flags);
-	blk_stop_queue(msb->queue);
 	msb->io_queue_stopped = true;
 	spin_unlock_irqrestore(&msb->q_lock, flags);
 
 	del_timer_sync(&msb->cache_flush_timer);
 	flush_workqueue(msb->io_queue);
 
+	spin_lock_irqsave(&msb->q_lock, flags);
 	if (msb->req) {
-		spin_lock_irqsave(&msb->q_lock, flags);
-		blk_requeue_request(msb->queue, msb->req);
+		blk_mq_requeue_request(msb->req, false);
 		msb->req = NULL;
-		spin_unlock_irqrestore(&msb->q_lock, flags);
 	}
-
+	spin_unlock_irqrestore(&msb->q_lock, flags);
 }
 
 static void msb_start(struct memstick_dev *card)
@@ -2077,9 +2083,7 @@ static void msb_start(struct memstick_dev *card)
 	msb->need_flush_cache = true;
 	msb->io_queue_stopped = false;
 
-	spin_lock_irqsave(&msb->q_lock, flags);
-	blk_start_queue(msb->queue);
-	spin_unlock_irqrestore(&msb->q_lock, flags);
+	blk_mq_start_hw_queues(msb->queue);
 
 	queue_work(msb->io_queue, &msb->io_work);
 
@@ -2092,10 +2096,15 @@ static const struct block_device_operations msb_bdops = {
 	.owner   = THIS_MODULE
 };
 
+static const struct blk_mq_ops msb_mq_ops = {
+	.queue_rq	= msb_queue_rq,
+};
+
 /* Registers the block device */
 static int msb_init_disk(struct memstick_dev *card)
 {
 	struct msb_data *msb = memstick_get_drvdata(card);
+	struct blk_mq_tag_set *set = NULL;
 	int rc;
 	unsigned long capacity;
 
@@ -2112,9 +2121,11 @@ static int msb_init_disk(struct memstick_dev *card)
 		goto out_release_id;
 	}
 
-	msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock);
-	if (!msb->queue) {
-		rc = -ENOMEM;
+	msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &msb_mq_ops, 2,
+						BLK_MQ_F_SHOULD_MERGE);
+	if (IS_ERR(msb->queue)) {
+		rc = PTR_ERR(msb->queue);
+		msb->queue = NULL;
 		goto out_put_disk;
 	}
 
@@ -2202,12 +2213,13 @@ static void msb_remove(struct memstick_dev *card)
 	/* Take care of unhandled + new requests from now on */
 	spin_lock_irqsave(&msb->q_lock, flags);
 	msb->card_dead = true;
-	blk_start_queue(msb->queue);
 	spin_unlock_irqrestore(&msb->q_lock, flags);
+	blk_mq_start_hw_queues(msb->queue);
 
 	/* Remove the disk */
 	del_gendisk(msb->disk);
 	blk_cleanup_queue(msb->queue);
+	blk_mq_free_tag_set(&msb->tag_set);
 	msb->queue = NULL;
 
 	mutex_lock(&msb_disk_lock);
diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h
index 53962c3b21df..9ba84e0ced63 100644
--- a/drivers/memstick/core/ms_block.h
+++ b/drivers/memstick/core/ms_block.h
@@ -152,6 +152,7 @@ struct msb_data {
 	struct gendisk			*disk;
 	struct request_queue		*queue;
 	spinlock_t			q_lock;
+	struct blk_mq_tag_set		tag_set;
 	struct hd_geometry		geometry;
 	struct attribute_group		attr_group;
 	struct request			*req;

From d0be12274dad242271fb2055275d10b67a0d7649 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 15 Oct 2018 09:00:02 -0600
Subject: [PATCH 008/351] mspro_block: convert to blk-mq

Straight forward conversion, there's room for improvement.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/memstick/core/mspro_block.c | 123 +++++++++++++++-------------
 1 file changed, 67 insertions(+), 56 deletions(-)

diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 0cd30dcb6801..aba50ec98b4d 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -12,7 +12,7 @@
  *
  */
 
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/kthread.h>
@@ -142,6 +142,7 @@ struct mspro_block_data {
 	struct gendisk        *disk;
 	struct request_queue  *queue;
 	struct request        *block_req;
+	struct blk_mq_tag_set tag_set;
 	spinlock_t            q_lock;
 
 	unsigned short        page_size;
@@ -152,7 +153,6 @@ struct mspro_block_data {
 	unsigned char         system;
 	unsigned char         read_only:1,
 			      eject:1,
-			      has_request:1,
 			      data_dir:1,
 			      active:1;
 	unsigned char         transfer_cmd;
@@ -694,13 +694,12 @@ static void h_mspro_block_setup_cmd(struct memstick_dev *card, u64 offset,
 
 /*** Data transfer ***/
 
-static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
+static int mspro_block_issue_req(struct memstick_dev *card, bool chunk)
 {
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
 	u64 t_off;
 	unsigned int count;
 
-try_again:
 	while (chunk) {
 		msb->current_page = 0;
 		msb->current_seg = 0;
@@ -709,9 +708,17 @@ try_again:
 					       msb->req_sg);
 
 		if (!msb->seg_count) {
-			chunk = __blk_end_request_cur(msb->block_req,
-					BLK_STS_RESOURCE);
-			continue;
+			unsigned int bytes = blk_rq_cur_bytes(msb->block_req);
+
+			chunk = blk_update_request(msb->block_req,
+							BLK_STS_RESOURCE,
+							bytes);
+			if (chunk)
+				continue;
+			__blk_mq_end_request(msb->block_req,
+						BLK_STS_RESOURCE);
+			msb->block_req = NULL;
+			break;
 		}
 
 		t_off = blk_rq_pos(msb->block_req);
@@ -729,30 +736,22 @@ try_again:
 		return 0;
 	}
 
-	dev_dbg(&card->dev, "blk_fetch\n");
-	msb->block_req = blk_fetch_request(msb->queue);
-	if (!msb->block_req) {
-		dev_dbg(&card->dev, "issue end\n");
-		return -EAGAIN;
-	}
-
-	dev_dbg(&card->dev, "trying again\n");
-	chunk = 1;
-	goto try_again;
+	return 1;
 }
 
 static int mspro_block_complete_req(struct memstick_dev *card, int error)
 {
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
-	int chunk, cnt;
+	int cnt;
+	bool chunk;
 	unsigned int t_len = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&msb->q_lock, flags);
-	dev_dbg(&card->dev, "complete %d, %d\n", msb->has_request ? 1 : 0,
+	dev_dbg(&card->dev, "complete %d, %d\n", msb->block_req ? 1 : 0,
 		error);
 
-	if (msb->has_request) {
+	if (msb->block_req) {
 		/* Nothing to do - not really an error */
 		if (error == -EAGAIN)
 			error = 0;
@@ -777,15 +776,17 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
 		if (error && !t_len)
 			t_len = blk_rq_cur_bytes(msb->block_req);
 
-		chunk = __blk_end_request(msb->block_req,
+		chunk = blk_update_request(msb->block_req,
 				errno_to_blk_status(error), t_len);
-
-		error = mspro_block_issue_req(card, chunk);
-
-		if (!error)
-			goto out;
-		else
-			msb->has_request = 0;
+		if (chunk) {
+			error = mspro_block_issue_req(card, chunk);
+			if (!error)
+				goto out;
+		} else {
+			__blk_mq_end_request(msb->block_req,
+						errno_to_blk_status(error));
+			msb->block_req = NULL;
+		}
 	} else {
 		if (!error)
 			error = -EAGAIN;
@@ -806,8 +807,8 @@ static void mspro_block_stop(struct memstick_dev *card)
 
 	while (1) {
 		spin_lock_irqsave(&msb->q_lock, flags);
-		if (!msb->has_request) {
-			blk_stop_queue(msb->queue);
+		if (!msb->block_req) {
+			blk_mq_stop_hw_queues(msb->queue);
 			rc = 1;
 		}
 		spin_unlock_irqrestore(&msb->q_lock, flags);
@@ -822,32 +823,37 @@ static void mspro_block_stop(struct memstick_dev *card)
 static void mspro_block_start(struct memstick_dev *card)
 {
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
-	unsigned long flags;
 
-	spin_lock_irqsave(&msb->q_lock, flags);
-	blk_start_queue(msb->queue);
-	spin_unlock_irqrestore(&msb->q_lock, flags);
+	blk_mq_start_hw_queues(msb->queue);
 }
 
-static void mspro_block_submit_req(struct request_queue *q)
+static blk_status_t mspro_queue_rq(struct blk_mq_hw_ctx *hctx,
+				   const struct blk_mq_queue_data *bd)
 {
-	struct memstick_dev *card = q->queuedata;
+	struct memstick_dev *card = hctx->queue->queuedata;
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
-	struct request *req = NULL;
 
-	if (msb->has_request)
-		return;
+	spin_lock_irq(&msb->q_lock);
 
-	if (msb->eject) {
-		while ((req = blk_fetch_request(q)) != NULL)
-			__blk_end_request_all(req, BLK_STS_IOERR);
-
-		return;
+	if (msb->block_req) {
+		spin_unlock_irq(&msb->q_lock);
+		return BLK_STS_DEV_RESOURCE;
 	}
 
-	msb->has_request = 1;
-	if (mspro_block_issue_req(card, 0))
-		msb->has_request = 0;
+	if (msb->eject) {
+		spin_unlock_irq(&msb->q_lock);
+		blk_mq_start_request(bd->rq);
+		return BLK_STS_IOERR;
+	}
+
+	msb->block_req = bd->rq;
+	blk_mq_start_request(bd->rq);
+
+	if (mspro_block_issue_req(card, true))
+		msb->block_req = NULL;
+
+	spin_unlock_irq(&msb->q_lock);
+	return BLK_STS_OK;
 }
 
 /*** Initialization ***/
@@ -1167,6 +1173,10 @@ static int mspro_block_init_card(struct memstick_dev *card)
 
 }
 
+static const struct blk_mq_ops mspro_mq_ops = {
+	.queue_rq	= mspro_queue_rq,
+};
+
 static int mspro_block_init_disk(struct memstick_dev *card)
 {
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
@@ -1206,9 +1216,11 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 		goto out_release_id;
 	}
 
-	msb->queue = blk_init_queue(mspro_block_submit_req, &msb->q_lock);
-	if (!msb->queue) {
-		rc = -ENOMEM;
+	msb->queue = blk_mq_init_sq_queue(&msb->tag_set, &mspro_mq_ops, 2,
+						BLK_MQ_F_SHOULD_MERGE);
+	if (IS_ERR(msb->queue)) {
+		rc = PTR_ERR(msb->queue);
+		msb->queue = NULL;
 		goto out_put_disk;
 	}
 
@@ -1318,13 +1330,14 @@ static void mspro_block_remove(struct memstick_dev *card)
 
 	spin_lock_irqsave(&msb->q_lock, flags);
 	msb->eject = 1;
-	blk_start_queue(msb->queue);
 	spin_unlock_irqrestore(&msb->q_lock, flags);
+	blk_mq_start_hw_queues(msb->queue);
 
 	del_gendisk(msb->disk);
 	dev_dbg(&card->dev, "mspro block remove\n");
 
 	blk_cleanup_queue(msb->queue);
+	blk_mq_free_tag_set(&msb->tag_set);
 	msb->queue = NULL;
 
 	sysfs_remove_group(&card->dev.kobj, &msb->attr_group);
@@ -1344,8 +1357,9 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state)
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
 	unsigned long flags;
 
+	blk_mq_stop_hw_queues(msb->queue);
+
 	spin_lock_irqsave(&msb->q_lock, flags);
-	blk_stop_queue(msb->queue);
 	msb->active = 0;
 	spin_unlock_irqrestore(&msb->q_lock, flags);
 
@@ -1355,7 +1369,6 @@ static int mspro_block_suspend(struct memstick_dev *card, pm_message_t state)
 static int mspro_block_resume(struct memstick_dev *card)
 {
 	struct mspro_block_data *msb = memstick_get_drvdata(card);
-	unsigned long flags;
 	int rc = 0;
 
 #ifdef CONFIG_MEMSTICK_UNSAFE_RESUME
@@ -1401,9 +1414,7 @@ out_unlock:
 
 #endif /* CONFIG_MEMSTICK_UNSAFE_RESUME */
 
-	spin_lock_irqsave(&msb->q_lock, flags);
-	blk_start_queue(msb->queue);
-	spin_unlock_irqrestore(&msb->q_lock, flags);
+	blk_mq_start_hw_queues(msb->queue);
 	return rc;
 }
 

From 600335205b8d162891b5ef2e32343f5b8020efd8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 26 Oct 2018 09:53:52 -0600
Subject: [PATCH 009/351] ide: convert to blk-mq

ide-disk and ide-cd tested as working just fine, ide-tape and
ide-floppy haven't. But the latter don't require changes, so they
should work without issue.

Add helper function to insert a request from a work queue, since we
cannot invoke the blk-mq request insertion from IRQ context.

Cc: David Miller <davem@davemloft.net>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-atapi.c |  25 ++++--
 drivers/ide/ide-cd.c    | 175 +++++++++++++++++++++-------------------
 drivers/ide/ide-disk.c  |   5 +-
 drivers/ide/ide-io.c    | 100 +++++++++++++----------
 drivers/ide/ide-park.c  |   4 +-
 drivers/ide/ide-pm.c    |  28 ++-----
 drivers/ide/ide-probe.c |  68 +++++++++++-----
 include/linux/ide.h     |  13 ++-
 8 files changed, 239 insertions(+), 179 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 8b2b72b93885..33210bc67618 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -172,8 +172,8 @@ EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 {
 	struct request_sense *sense = &drive->sense_data;
-	struct request *sense_rq = drive->sense_rq;
-	struct scsi_request *req = scsi_req(sense_rq);
+	struct request *sense_rq;
+	struct scsi_request *req;
 	unsigned int cmd_len, sense_len;
 	int err;
 
@@ -196,9 +196,16 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	if (ata_sense_request(rq) || drive->sense_rq_armed)
 		return;
 
+	sense_rq = drive->sense_rq;
+	if (!sense_rq) {
+		sense_rq = blk_mq_alloc_request(drive->queue, REQ_OP_DRV_IN,
+					BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT);
+		drive->sense_rq = sense_rq;
+	}
+	req = scsi_req(sense_rq);
+
 	memset(sense, 0, sizeof(*sense));
 
-	blk_rq_init(rq->q, sense_rq);
 	scsi_req_init(req);
 
 	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
@@ -207,6 +214,8 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 		if (printk_ratelimit())
 			printk(KERN_WARNING PFX "%s: failed to map sense "
 					    "buffer\n", drive->name);
+		blk_mq_free_request(sense_rq);
+		drive->sense_rq = NULL;
 		return;
 	}
 
@@ -226,6 +235,8 @@ EXPORT_SYMBOL_GPL(ide_prep_sense);
 
 int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 {
+	struct request *sense_rq = drive->sense_rq;
+
 	/* deferred failure from ide_prep_sense() */
 	if (!drive->sense_rq_armed) {
 		printk(KERN_WARNING PFX "%s: error queuing a sense request\n",
@@ -233,12 +244,12 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 		return -ENOMEM;
 	}
 
-	drive->sense_rq->special = special;
+	sense_rq->special = special;
 	drive->sense_rq_armed = false;
 
 	drive->hwif->rq = NULL;
 
-	elv_add_request(drive->queue, drive->sense_rq, ELEVATOR_INSERT_FRONT);
+	ide_insert_request_head(drive, sense_rq);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
@@ -270,10 +281,8 @@ void ide_retry_pc(ide_drive_t *drive)
 	 */
 	drive->hwif->rq = NULL;
 	ide_requeue_and_plug(drive, failed_rq);
-	if (ide_queue_sense_rq(drive, pc)) {
-		blk_start_request(failed_rq);
+	if (ide_queue_sense_rq(drive, pc))
 		ide_complete_rq(drive, BLK_STS_IOERR, blk_rq_bytes(failed_rq));
-	}
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index f9b59d41813f..4ecaf2ace4cb 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -258,11 +258,22 @@ static int ide_cd_breathe(ide_drive_t *drive, struct request *rq)
 		/*
 		 * take a breather
 		 */
-		blk_delay_queue(drive->queue, 1);
+		blk_mq_requeue_request(rq, false);
+		blk_mq_delay_kick_requeue_list(drive->queue, 1);
 		return 1;
 	}
 }
 
+static void ide_cd_free_sense(ide_drive_t *drive)
+{
+	if (!drive->sense_rq)
+		return;
+
+	blk_mq_free_request(drive->sense_rq);
+	drive->sense_rq = NULL;
+	drive->sense_rq_armed = false;
+}
+
 /**
  * Returns:
  * 0: if the request should be continued.
@@ -516,6 +527,82 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
 	return false;
 }
 
+/* standard prep_rq_fn that builds 10 byte cmds */
+static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
+{
+	int hard_sect = queue_logical_block_size(q);
+	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
+	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
+	struct scsi_request *req = scsi_req(rq);
+
+	if (rq_data_dir(rq) == READ)
+		req->cmd[0] = GPCMD_READ_10;
+	else
+		req->cmd[0] = GPCMD_WRITE_10;
+
+	/*
+	 * fill in lba
+	 */
+	req->cmd[2] = (block >> 24) & 0xff;
+	req->cmd[3] = (block >> 16) & 0xff;
+	req->cmd[4] = (block >>  8) & 0xff;
+	req->cmd[5] = block & 0xff;
+
+	/*
+	 * and transfer length
+	 */
+	req->cmd[7] = (blocks >> 8) & 0xff;
+	req->cmd[8] = blocks & 0xff;
+	req->cmd_len = 10;
+	return BLKPREP_OK;
+}
+
+/*
+ * Most of the SCSI commands are supported directly by ATAPI devices.
+ * This transform handles the few exceptions.
+ */
+static int ide_cdrom_prep_pc(struct request *rq)
+{
+	u8 *c = scsi_req(rq)->cmd;
+
+	/* transform 6-byte read/write commands to the 10-byte version */
+	if (c[0] == READ_6 || c[0] == WRITE_6) {
+		c[8] = c[4];
+		c[5] = c[3];
+		c[4] = c[2];
+		c[3] = c[1] & 0x1f;
+		c[2] = 0;
+		c[1] &= 0xe0;
+		c[0] += (READ_10 - READ_6);
+		scsi_req(rq)->cmd_len = 10;
+		return BLKPREP_OK;
+	}
+
+	/*
+	 * it's silly to pretend we understand 6-byte sense commands, just
+	 * reject with ILLEGAL_REQUEST and the caller should take the
+	 * appropriate action
+	 */
+	if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
+		scsi_req(rq)->result = ILLEGAL_REQUEST;
+		return BLKPREP_KILL;
+	}
+
+	return BLKPREP_OK;
+}
+
+static int ide_cdrom_prep_fn(ide_drive_t *drive, struct request *rq)
+{
+	if (!blk_rq_is_passthrough(rq)) {
+		scsi_req_init(scsi_req(rq));
+
+		return ide_cdrom_prep_fs(drive->queue, rq);
+	} else if (blk_rq_is_scsi(rq))
+		return ide_cdrom_prep_pc(rq);
+
+	return 0;
+}
+
 static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 {
 	ide_hwif_t *hwif = drive->hwif;
@@ -675,7 +762,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 out_end:
 	if (blk_rq_is_scsi(rq) && rc == 0) {
 		scsi_req(rq)->resid_len = 0;
-		blk_end_request_all(rq, BLK_STS_OK);
+		blk_mq_end_request(rq, BLK_STS_OK);
 		hwif->rq = NULL;
 	} else {
 		if (sense && uptodate)
@@ -705,6 +792,8 @@ out_end:
 		if (sense && rc == 2)
 			ide_error(drive, "request sense failure", stat);
 	}
+
+	ide_cd_free_sense(drive);
 	return ide_stopped;
 }
 
@@ -729,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 		 * We may be retrying this request after an error.  Fix up any
 		 * weirdness which might be present in the request packet.
 		 */
-		q->prep_rq_fn(q, rq);
+		ide_cdrom_prep_fn(drive, rq);
 	}
 
 	/* fs requests *must* be hardware frame aligned */
@@ -1323,82 +1412,6 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 	return nslots;
 }
 
-/* standard prep_rq_fn that builds 10 byte cmds */
-static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
-{
-	int hard_sect = queue_logical_block_size(q);
-	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
-	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
-	struct scsi_request *req = scsi_req(rq);
-
-	q->initialize_rq_fn(rq);
-
-	if (rq_data_dir(rq) == READ)
-		req->cmd[0] = GPCMD_READ_10;
-	else
-		req->cmd[0] = GPCMD_WRITE_10;
-
-	/*
-	 * fill in lba
-	 */
-	req->cmd[2] = (block >> 24) & 0xff;
-	req->cmd[3] = (block >> 16) & 0xff;
-	req->cmd[4] = (block >>  8) & 0xff;
-	req->cmd[5] = block & 0xff;
-
-	/*
-	 * and transfer length
-	 */
-	req->cmd[7] = (blocks >> 8) & 0xff;
-	req->cmd[8] = blocks & 0xff;
-	req->cmd_len = 10;
-	return BLKPREP_OK;
-}
-
-/*
- * Most of the SCSI commands are supported directly by ATAPI devices.
- * This transform handles the few exceptions.
- */
-static int ide_cdrom_prep_pc(struct request *rq)
-{
-	u8 *c = scsi_req(rq)->cmd;
-
-	/* transform 6-byte read/write commands to the 10-byte version */
-	if (c[0] == READ_6 || c[0] == WRITE_6) {
-		c[8] = c[4];
-		c[5] = c[3];
-		c[4] = c[2];
-		c[3] = c[1] & 0x1f;
-		c[2] = 0;
-		c[1] &= 0xe0;
-		c[0] += (READ_10 - READ_6);
-		scsi_req(rq)->cmd_len = 10;
-		return BLKPREP_OK;
-	}
-
-	/*
-	 * it's silly to pretend we understand 6-byte sense commands, just
-	 * reject with ILLEGAL_REQUEST and the caller should take the
-	 * appropriate action
-	 */
-	if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
-		scsi_req(rq)->result = ILLEGAL_REQUEST;
-		return BLKPREP_KILL;
-	}
-
-	return BLKPREP_OK;
-}
-
-static int ide_cdrom_prep_fn(struct request_queue *q, struct request *rq)
-{
-	if (!blk_rq_is_passthrough(rq))
-		return ide_cdrom_prep_fs(q, rq);
-	else if (blk_rq_is_scsi(rq))
-		return ide_cdrom_prep_pc(rq);
-
-	return 0;
-}
-
 struct cd_list_entry {
 	const char	*id_model;
 	const char	*id_firmware;
@@ -1508,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 
 	ide_debug_log(IDE_DBG_PROBE, "enter");
 
-	blk_queue_prep_rq(q, ide_cdrom_prep_fn);
+	drive->prep_rq = ide_cdrom_prep_fn;
 	blk_queue_dma_alignment(q, 31);
 	blk_queue_update_dma_pad(q, 15);
 
@@ -1569,7 +1582,7 @@ static void ide_cd_release(struct device *dev)
 	if (devinfo->handle == drive)
 		unregister_cdrom(devinfo);
 	drive->driver_data = NULL;
-	blk_queue_prep_rq(drive->queue, NULL);
+	drive->prep_rq = NULL;
 	g->private_data = NULL;
 	put_disk(g);
 	kfree(info);
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index e3b4e659082d..f8567c8c9dd1 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -427,9 +427,8 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
 		drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
 }
 
-static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
+static int idedisk_prep_fn(ide_drive_t *drive, struct request *rq)
 {
-	ide_drive_t *drive = q->queuedata;
 	struct ide_cmd *cmd;
 
 	if (req_op(rq) != REQ_OP_FLUSH)
@@ -548,7 +547,7 @@ static void update_flush(ide_drive_t *drive)
 
 		if (barrier) {
 			wc = true;
-			blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
+			drive->prep_rq = idedisk_prep_fn;
 		}
 	}
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 0d93e0cfbeaf..5093c605c91c 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -67,7 +67,15 @@ int ide_end_rq(ide_drive_t *drive, struct request *rq, blk_status_t error,
 		ide_dma_on(drive);
 	}
 
-	return blk_end_request(rq, error, nr_bytes);
+	if (!blk_update_request(rq, error, nr_bytes)) {
+		if (rq == drive->sense_rq)
+			drive->sense_rq = NULL;
+
+		__blk_mq_end_request(rq, error);
+		return 0;
+	}
+
+	return 1;
 }
 EXPORT_SYMBOL_GPL(ide_end_rq);
 
@@ -307,8 +315,6 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 {
 	ide_startstop_t startstop;
 
-	BUG_ON(!(rq->rq_flags & RQF_STARTED));
-
 #ifdef DEBUG
 	printk("%s: start_request: current=0x%08lx\n",
 		drive->hwif->name, (unsigned long) rq);
@@ -320,6 +326,9 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		goto kill_rq;
 	}
 
+	if (drive->prep_rq && drive->prep_rq(drive, rq))
+		return ide_stopped;
+
 	if (ata_pm_request(rq))
 		ide_check_pm_state(drive, rq);
 
@@ -430,44 +439,38 @@ static inline void ide_unlock_host(struct ide_host *host)
 	}
 }
 
-static void __ide_requeue_and_plug(struct request_queue *q, struct request *rq)
-{
-	if (rq)
-		blk_requeue_request(q, rq);
-	if (rq || blk_peek_request(q)) {
-		/* Use 3ms as that was the old plug delay */
-		blk_delay_queue(q, 3);
-	}
-}
-
 void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	unsigned long flags;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	__ide_requeue_and_plug(q, rq);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	/* Use 3ms as that was the old plug delay */
+	if (rq) {
+		blk_mq_requeue_request(rq, false);
+		blk_mq_delay_kick_requeue_list(q, 3);
+	} else
+		blk_mq_delay_run_hw_queue(q->queue_hw_ctx[0], 3);
 }
 
 /*
  * Issue a new request to a device.
  */
-void do_ide_request(struct request_queue *q)
+blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *bd)
 {
-	ide_drive_t	*drive = q->queuedata;
+	ide_drive_t	*drive = hctx->queue->queuedata;
 	ide_hwif_t	*hwif = drive->hwif;
 	struct ide_host *host = hwif->host;
 	struct request	*rq = NULL;
 	ide_startstop_t	startstop;
 
-	spin_unlock_irq(q->queue_lock);
-
 	/* HLD do_request() callback might sleep, make sure it's okay */
 	might_sleep();
 
 	if (ide_lock_host(host, hwif))
-		goto plug_device_2;
+		return BLK_STS_DEV_RESOURCE;
+
+	rq = bd->rq;
+	blk_mq_start_request(rq);
 
 	spin_lock_irq(&hwif->lock);
 
@@ -503,21 +506,16 @@ repeat:
 		hwif->cur_dev = drive;
 		drive->dev_flags &= ~(IDE_DFLAG_SLEEPING | IDE_DFLAG_PARKED);
 
-		spin_unlock_irq(&hwif->lock);
-		spin_lock_irq(q->queue_lock);
 		/*
 		 * we know that the queue isn't empty, but this can happen
 		 * if the q->prep_rq_fn() decides to kill a request
 		 */
-		if (!rq)
-			rq = blk_fetch_request(drive->queue);
-
-		spin_unlock_irq(q->queue_lock);
-		spin_lock_irq(&hwif->lock);
-
 		if (!rq) {
-			ide_unlock_port(hwif);
-			goto out;
+			rq = bd->rq;
+			if (!rq) {
+				ide_unlock_port(hwif);
+				goto out;
+			}
 		}
 
 		/*
@@ -551,23 +549,24 @@ repeat:
 		if (startstop == ide_stopped) {
 			rq = hwif->rq;
 			hwif->rq = NULL;
-			goto repeat;
+			if (rq)
+				goto repeat;
+			ide_unlock_port(hwif);
+			goto out;
 		}
-	} else
-		goto plug_device;
+	} else {
+plug_device:
+		spin_unlock_irq(&hwif->lock);
+		ide_unlock_host(host);
+		ide_requeue_and_plug(drive, rq);
+		return BLK_STS_OK;
+	}
+
 out:
 	spin_unlock_irq(&hwif->lock);
 	if (rq == NULL)
 		ide_unlock_host(host);
-	spin_lock_irq(q->queue_lock);
-	return;
-
-plug_device:
-	spin_unlock_irq(&hwif->lock);
-	ide_unlock_host(host);
-plug_device_2:
-	spin_lock_irq(q->queue_lock);
-	__ide_requeue_and_plug(q, rq);
+	return BLK_STS_OK;
 }
 
 static int drive_is_ready(ide_drive_t *drive)
@@ -887,3 +886,16 @@ void ide_pad_transfer(ide_drive_t *drive, int write, int len)
 	}
 }
 EXPORT_SYMBOL_GPL(ide_pad_transfer);
+
+void ide_insert_request_head(ide_drive_t *drive, struct request *rq)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	unsigned long flags;
+
+	spin_lock_irqsave(&hwif->lock, flags);
+	list_add_tail(&rq->queuelist, &drive->rq_list);
+	spin_unlock_irqrestore(&hwif->lock, flags);
+
+	kblockd_schedule_work(&drive->rq_work);
+}
+EXPORT_SYMBOL_GPL(ide_insert_request_head);
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 622f0edb3945..de9e85cf74d1 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -27,7 +27,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 		spin_unlock_irq(&hwif->lock);
 
 		if (start_queue)
-			blk_run_queue(q);
+			blk_mq_run_hw_queues(q, true);
 		return;
 	}
 	spin_unlock_irq(&hwif->lock);
@@ -54,7 +54,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd[0] = REQ_UNPARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
-	elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
+	ide_insert_request_head(drive, rq);
 
 out:
 	return;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 59217aa1d1fb..ea10507e5190 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -40,32 +40,20 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	return ret;
 }
 
-static void ide_end_sync_rq(struct request *rq, blk_status_t error)
-{
-	complete(rq->end_io_data);
-}
-
 static int ide_pm_execute_rq(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-	DECLARE_COMPLETION_ONSTACK(wait);
-
-	rq->end_io_data = &wait;
-	rq->end_io = ide_end_sync_rq;
 
 	spin_lock_irq(q->queue_lock);
 	if (unlikely(blk_queue_dying(q))) {
 		rq->rq_flags |= RQF_QUIET;
 		scsi_req(rq)->result = -ENXIO;
-		__blk_end_request_all(rq, BLK_STS_OK);
 		spin_unlock_irq(q->queue_lock);
+		blk_mq_end_request(rq, BLK_STS_OK);
 		return -ENXIO;
 	}
-	__elv_add_request(q, rq, ELEVATOR_INSERT_FRONT);
-	__blk_run_queue_uncond(q);
 	spin_unlock_irq(q->queue_lock);
-
-	wait_for_completion_io(&wait);
+	blk_execute_rq(q, NULL, rq, true);
 
 	return scsi_req(rq)->result ? -EIO : 0;
 }
@@ -79,6 +67,8 @@ int generic_ide_resume(struct device *dev)
 	struct ide_pm_state rqpm;
 	int err;
 
+	blk_mq_start_stopped_hw_queues(drive->queue, true);
+
 	if (ide_port_acpi(hwif)) {
 		/* call ACPI _PS0 / _STM only once */
 		if ((drive->dn & 1) == 0 || pair == NULL) {
@@ -226,15 +216,14 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 #endif
 	spin_lock_irqsave(q->queue_lock, flags);
 	if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
-		blk_stop_queue(q);
+		blk_mq_stop_hw_queues(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	drive->hwif->rq = NULL;
 
-	if (blk_end_request(rq, BLK_STS_OK, 0))
-		BUG();
+	blk_mq_end_request(rq, BLK_STS_OK);
 }
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
@@ -260,7 +249,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 		ide_hwif_t *hwif = drive->hwif;
 		const struct ide_tp_ops *tp_ops = hwif->tp_ops;
 		struct request_queue *q = drive->queue;
-		unsigned long flags;
 		int rc;
 #ifdef DEBUG_PM
 		printk("%s: Wakeup request inited, waiting for !BSY...\n", drive->name);
@@ -274,8 +262,6 @@ void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 		if (rc)
 			printk(KERN_WARNING "%s: drive not ready on wakeup\n", drive->name);
 
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_start_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
+		blk_mq_start_hw_queues(q);
 	}
 }
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 3b75a7b7a284..40384838e439 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -750,6 +750,11 @@ static void ide_initialize_rq(struct request *rq)
 	req->sreq.sense = req->sense;
 }
 
+static const struct blk_mq_ops ide_mq_ops = {
+	.queue_rq		= ide_queue_rq,
+	.initialize_rq_fn	= ide_initialize_rq,
+};
+
 /*
  * init request queue
  */
@@ -759,6 +764,7 @@ static int ide_init_queue(ide_drive_t *drive)
 	ide_hwif_t *hwif = drive->hwif;
 	int max_sectors = 256;
 	int max_sg_entries = PRD_ENTRIES;
+	struct blk_mq_tag_set *set;
 
 	/*
 	 *	Our default set up assumes the normal IDE case,
@@ -767,19 +773,26 @@ static int ide_init_queue(ide_drive_t *drive)
 	 *	limits and LBA48 we could raise it but as yet
 	 *	do not.
 	 */
-	q = blk_alloc_queue_node(GFP_KERNEL, hwif_to_node(hwif), NULL);
-	if (!q)
+
+	set = &drive->tag_set;
+	set->ops = &ide_mq_ops;
+	set->nr_hw_queues = 1;
+	set->queue_depth = 32;
+	set->reserved_tags = 1;
+	set->cmd_size = sizeof(struct ide_request);
+	set->numa_node = hwif_to_node(hwif);
+	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+	if (blk_mq_alloc_tag_set(set))
 		return 1;
 
-	q->request_fn = do_ide_request;
-	q->initialize_rq_fn = ide_initialize_rq;
-	q->cmd_size = sizeof(struct ide_request);
-	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
-	if (blk_init_allocated_queue(q) < 0) {
-		blk_cleanup_queue(q);
+	q = blk_mq_init_queue(set);
+	if (IS_ERR(q)) {
+		blk_mq_free_tag_set(set);
 		return 1;
 	}
 
+	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
+
 	q->queuedata = drive;
 	blk_queue_segment_boundary(q, 0xffff);
 
@@ -965,8 +978,12 @@ static void drive_release_dev (struct device *dev)
 
 	ide_proc_unregister_device(drive);
 
+	if (drive->sense_rq)
+		blk_mq_free_request(drive->sense_rq);
+
 	blk_cleanup_queue(drive->queue);
 	drive->queue = NULL;
+	blk_mq_free_tag_set(&drive->tag_set);
 
 	drive->dev_flags &= ~IDE_DFLAG_PRESENT;
 
@@ -1133,6 +1150,28 @@ static void ide_port_cable_detect(ide_hwif_t *hwif)
 	}
 }
 
+/*
+ * Deferred request list insertion handler
+ */
+static void drive_rq_insert_work(struct work_struct *work)
+{
+	ide_drive_t *drive = container_of(work, ide_drive_t, rq_work);
+	ide_hwif_t *hwif = drive->hwif;
+	struct request *rq;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&hwif->lock);
+	if (!list_empty(&drive->rq_list))
+		list_splice_init(&drive->rq_list, &list);
+	spin_unlock_irq(&hwif->lock);
+
+	while (!list_empty(&list)) {
+		rq = list_first_entry(&list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		blk_execute_rq_nowait(drive->queue, rq->rq_disk, rq, true, NULL);
+	}
+}
+
 static const u8 ide_hwif_to_major[] =
 	{ IDE0_MAJOR, IDE1_MAJOR, IDE2_MAJOR, IDE3_MAJOR, IDE4_MAJOR,
 	  IDE5_MAJOR, IDE6_MAJOR, IDE7_MAJOR, IDE8_MAJOR, IDE9_MAJOR };
@@ -1145,12 +1184,10 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
 	ide_port_for_each_dev(i, drive, hwif) {
 		u8 j = (hwif->index * MAX_DRIVES) + i;
 		u16 *saved_id = drive->id;
-		struct request *saved_sense_rq = drive->sense_rq;
 
 		memset(drive, 0, sizeof(*drive));
 		memset(saved_id, 0, SECTOR_SIZE);
 		drive->id = saved_id;
-		drive->sense_rq = saved_sense_rq;
 
 		drive->media			= ide_disk;
 		drive->select			= (i << 4) | ATA_DEVICE_OBS;
@@ -1166,6 +1203,9 @@ static void ide_port_init_devices_data(ide_hwif_t *hwif)
 
 		INIT_LIST_HEAD(&drive->list);
 		init_completion(&drive->gendev_rel_comp);
+
+		INIT_WORK(&drive->rq_work, drive_rq_insert_work);
+		INIT_LIST_HEAD(&drive->rq_list);
 	}
 }
 
@@ -1255,7 +1295,6 @@ static void ide_port_free_devices(ide_hwif_t *hwif)
 	int i;
 
 	ide_port_for_each_dev(i, drive, hwif) {
-		kfree(drive->sense_rq);
 		kfree(drive->id);
 		kfree(drive);
 	}
@@ -1283,17 +1322,10 @@ static int ide_port_alloc_devices(ide_hwif_t *hwif, int node)
 		if (drive->id == NULL)
 			goto out_free_drive;
 
-		drive->sense_rq = kmalloc(sizeof(struct request) +
-				sizeof(struct ide_request), GFP_KERNEL);
-		if (!drive->sense_rq)
-			goto out_free_id;
-
 		hwif->devices[i] = drive;
 	}
 	return 0;
 
-out_free_id:
-	kfree(drive->id);
 out_free_drive:
 	kfree(drive);
 out_nomem:
diff --git a/include/linux/ide.h b/include/linux/ide.h
index c74b0321922a..079f8bc0b0f4 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -10,7 +10,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/ata.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
 #include <linux/bitops.h>
@@ -529,6 +529,10 @@ struct ide_drive_s {
 
 	struct request_queue	*queue;	/* request queue */
 
+	int (*prep_rq)(struct ide_drive_s *, struct request *);
+
+	struct blk_mq_tag_set	tag_set;
+
 	struct request		*rq;	/* current request */
 	void		*driver_data;	/* extra driver data */
 	u16			*id;	/* identification info */
@@ -612,6 +616,10 @@ struct ide_drive_s {
 	bool sense_rq_armed;
 	struct request *sense_rq;
 	struct request_sense sense_data;
+
+	/* async sense insertion */
+	struct work_struct rq_work;
+	struct list_head rq_list;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1089,6 +1097,7 @@ extern int ide_pci_clk;
 
 int ide_end_rq(ide_drive_t *, struct request *, blk_status_t, unsigned int);
 void ide_kill_rq(ide_drive_t *, struct request *);
+void ide_insert_request_head(ide_drive_t *, struct request *);
 
 void __ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
 void ide_set_handler(ide_drive_t *, ide_handler_t *, unsigned int);
@@ -1208,7 +1217,7 @@ extern void ide_stall_queue(ide_drive_t *drive, unsigned long timeout);
 
 extern void ide_timer_expiry(struct timer_list *t);
 extern irqreturn_t ide_intr(int irq, void *dev_id);
-extern void do_ide_request(struct request_queue *);
+extern blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
 extern void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq);
 
 void ide_init_disk(struct gendisk *, ide_drive_t *);

From 7ac257b862f2cfba3a909d1051499d390cffad6c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 11 Oct 2018 17:03:21 -0600
Subject: [PATCH 010/351] blk-mq: remove the request_list usage

We don't do anything with it, that's just the legacy path.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3f91c6e5b17a..4c82dc44d4d8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -510,9 +510,6 @@ void blk_mq_free_request(struct request *rq)
 
 	rq_qos_done(q, rq);
 
-	if (blk_rq_rl(rq))
-		blk_put_rl(blk_rq_rl(rq));
-
 	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 	if (refcount_dec_and_test(&rq->ref))
 		__blk_mq_free_request(rq);
@@ -1675,8 +1672,6 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
 {
 	blk_init_request_from_bio(rq, bio);
 
-	blk_rq_set_rl(rq, blk_get_rl(rq->q, bio));
-
 	blk_account_io_start(rq, true);
 }
 

From ba7b443422d66b765c1aafe67b050be7dad7c676 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 11 Oct 2018 18:06:02 -0600
Subject: [PATCH 011/351] blk-mq: remove legacy check in queue
 blk_freeze_queue()

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c82dc44d4d8..a58d2d953876 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -177,8 +177,6 @@ void blk_freeze_queue(struct request_queue *q)
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
 	blk_freeze_queue_start(q);
-	if (!q->mq_ops)
-		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }
 

From 9ba20527f4d1430b5f3e5f566be5af3e156a3284 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:15:10 -0600
Subject: [PATCH 012/351] blk-mq: provide mq_ops->busy() hook

We'll hook into this from blk_lld_busy(), allowing blk-mq to also
return whether or not a given queue currently has requests in
progress.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 2 ++
 include/linux/blk-mq.h | 6 ++++++
 2 files changed, 8 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index ce12515f9b9b..ca1a3af49f87 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3431,6 +3431,8 @@ int blk_lld_busy(struct request_queue *q)
 {
 	if (q->lld_busy_fn)
 		return q->lld_busy_fn(q);
+	if (q->mq_ops && q->mq_ops->busy)
+		return q->mq_ops->busy(q);
 
 	return 0;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2286dc12c6bc..5c8418ebbfd6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -114,6 +114,7 @@ typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
+typedef bool (busy_fn)(struct request_queue *);
 
 
 struct blk_mq_ops {
@@ -165,6 +166,11 @@ struct blk_mq_ops {
 	/* Called from inside blk_get_request() */
 	void (*initialize_rq_fn)(struct request *rq);
 
+	/*
+	 * If set, returns whether or not this queue currently is busy
+	 */
+	busy_fn			*busy;
+
 	map_queues_fn		*map_queues;
 
 #ifdef CONFIG_BLK_DEBUG_FS

From 3a7ea2c483a53fc89e336f69c6ee1d7defe00811 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:17:28 -0600
Subject: [PATCH 013/351] scsi: provide mq_ops->busy() hook

Only the SCSI legacy path provides a way to check if target is
currently busy, provide the same for the MQ path.

Cc: linux-scsi@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index c7fccbb8f554..8b0345924a92 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1675,6 +1675,11 @@ static int scsi_lld_busy(struct request_queue *q)
 	return 0;
 }
 
+static bool scsi_mq_lld_busy(struct request_queue *q)
+{
+	return scsi_lld_busy(q);
+}
+
 /*
  * Kill a request for a dead device
  */
@@ -2326,6 +2331,7 @@ static const struct blk_mq_ops scsi_mq_ops = {
 	.init_request	= scsi_mq_init_request,
 	.exit_request	= scsi_mq_exit_request,
 	.initialize_rq_fn = scsi_initialize_rq,
+	.busy		= scsi_mq_lld_busy,
 	.map_queues	= scsi_map_queues,
 };
 

From f664a3cc17b7d0a2bc3b3ab96181e1029b0ec0e6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 1 Nov 2018 16:36:27 -0600
Subject: [PATCH 014/351] scsi: kill off the legacy IO path

This removes the legacy (non-mq) IO path for SCSI.

Cc: linux-scsi@vger.kernel.org
Acked-by: Himanshu Madhani <himanshu.madhani@cavium.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/scsi/scsi-parameters.txt |   5 -
 drivers/scsi/Kconfig                   |  12 -
 drivers/scsi/cxlflash/main.c           |   6 -
 drivers/scsi/hosts.c                   |  29 +-
 drivers/scsi/lpfc/lpfc_scsi.c          |   2 +-
 drivers/scsi/qedi/qedi_main.c          |   3 +-
 drivers/scsi/qla2xxx/qla_os.c          |  30 +-
 drivers/scsi/scsi.c                    |   5 +-
 drivers/scsi/scsi_debug.c              |   3 +-
 drivers/scsi/scsi_error.c              |   2 +-
 drivers/scsi/scsi_lib.c                | 603 ++-----------------------
 drivers/scsi/scsi_priv.h               |   1 -
 drivers/scsi/scsi_scan.c               |  10 +-
 drivers/scsi/scsi_sysfs.c              |   8 +-
 drivers/scsi/ufs/ufshcd.c              |   6 -
 include/scsi/scsi_host.h               |  18 +-
 include/scsi/scsi_tcq.h                |  14 +-
 17 files changed, 77 insertions(+), 680 deletions(-)

diff --git a/Documentation/scsi/scsi-parameters.txt b/Documentation/scsi/scsi-parameters.txt
index 92999d4e0cb8..25a4b4cf04a6 100644
--- a/Documentation/scsi/scsi-parameters.txt
+++ b/Documentation/scsi/scsi-parameters.txt
@@ -97,11 +97,6 @@ parameters may be changed at runtime by the command
 			allowing boot to proceed.  none ignores them, expecting
 			user space to do the scan.
 
-	scsi_mod.use_blk_mq=
-			[SCSI] use blk-mq I/O path by default
-			See SCSI_MQ_DEFAULT in drivers/scsi/Kconfig.
-			Format: <y/n>
-
 	sim710=		[SCSI,HW]
 			See header of drivers/scsi/sim710.c.
 
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index f07444d30b21..dfdc6940de2f 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -50,18 +50,6 @@ config SCSI_NETLINK
 	default	n
 	depends on NET
 
-config SCSI_MQ_DEFAULT
-	bool "SCSI: use blk-mq I/O path by default"
-	default y
-	depends on SCSI
-	---help---
-	  This option enables the blk-mq based I/O path for SCSI devices by
-	  default.  With this option the scsi_mod.use_blk_mq module/boot
-	  option defaults to Y, without it to N, but it can still be
-	  overridden either way.
-
-	  If unsure say Y.
-
 config SCSI_PROC_FS
 	bool "legacy /proc/scsi/ support"
 	depends on SCSI && PROC_FS
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index 6637116529aa..abdc9eac4173 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -3088,12 +3088,6 @@ static ssize_t hwq_mode_store(struct device *dev,
 		return -EINVAL;
 	}
 
-	if ((mode == HWQ_MODE_TAG) && !shost_use_blk_mq(shost)) {
-		dev_info(cfgdev, "SCSI-MQ is not enabled, use a different "
-			 "HWQ steering mode.\n");
-		return -EINVAL;
-	}
-
 	afu->hwq_mode = mode;
 
 	return count;
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ea4b0bb0c1cd..cc71136ba300 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -222,18 +222,9 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	if (error)
 		goto fail;
 
-	if (shost_use_blk_mq(shost)) {
-		error = scsi_mq_setup_tags(shost);
-		if (error)
-			goto fail;
-	} else {
-		shost->bqt = blk_init_tags(shost->can_queue,
-				shost->hostt->tag_alloc_policy);
-		if (!shost->bqt) {
-			error = -ENOMEM;
-			goto fail;
-		}
-	}
+	error = scsi_mq_setup_tags(shost);
+	if (error)
+		goto fail;
 
 	if (!shost->shost_gendev.parent)
 		shost->shost_gendev.parent = dev ? dev : &platform_bus;
@@ -309,8 +300,7 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
 	pm_runtime_disable(&shost->shost_gendev);
 	pm_runtime_set_suspended(&shost->shost_gendev);
 	pm_runtime_put_noidle(&shost->shost_gendev);
-	if (shost_use_blk_mq(shost))
-		scsi_mq_destroy_tags(shost);
+	scsi_mq_destroy_tags(shost);
  fail:
 	return error;
 }
@@ -344,13 +334,8 @@ static void scsi_host_dev_release(struct device *dev)
 		kfree(dev_name(&shost->shost_dev));
 	}
 
-	if (shost_use_blk_mq(shost)) {
-		if (shost->tag_set.tags)
-			scsi_mq_destroy_tags(shost);
-	} else {
-		if (shost->bqt)
-			blk_free_tags(shost->bqt);
-	}
+	if (shost->tag_set.tags)
+		scsi_mq_destroy_tags(shost);
 
 	kfree(shost->shost_data);
 
@@ -472,8 +457,6 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	else
 		shost->dma_boundary = 0xffffffff;
 
-	shost->use_blk_mq = scsi_use_blk_mq || shost->hostt->force_blk_mq;
-
 	device_initialize(&shost->shost_gendev);
 	dev_set_name(&shost->shost_gendev, "host%d", shost->host_no);
 	shost->shost_gendev.bus = &scsi_bus_type;
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 4fa6703a9ec9..baed2b891efb 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -3914,7 +3914,7 @@ int lpfc_sli4_scmd_to_wqidx_distr(struct lpfc_hba *phba,
 	uint32_t tag;
 	uint16_t hwq;
 
-	if (cmnd && shost_use_blk_mq(cmnd->device->host)) {
+	if (cmnd) {
 		tag = blk_mq_unique_tag(cmnd->request);
 		hwq = blk_mq_unique_tag_to_hwq(tag);
 
diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c
index 105b0e4d7818..311eb22068e1 100644
--- a/drivers/scsi/qedi/qedi_main.c
+++ b/drivers/scsi/qedi/qedi_main.c
@@ -644,8 +644,7 @@ static struct qedi_ctx *qedi_host_alloc(struct pci_dev *pdev)
 	qedi->max_active_conns = ISCSI_MAX_SESS_PER_HBA;
 	qedi->max_sqes = QEDI_SQ_SIZE;
 
-	if (shost_use_blk_mq(shost))
-		shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi);
+	shost->nr_hw_queues = MIN_NUM_CPUS_MSIX(qedi);
 
 	pci_set_drvdata(pdev, qedi);
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 518f15141170..4ea9f2b4e04f 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -857,13 +857,9 @@ qla2xxx_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmd)
 	}
 
 	if (ha->mqenable) {
-		if (shost_use_blk_mq(vha->host)) {
-			tag = blk_mq_unique_tag(cmd->request);
-			hwq = blk_mq_unique_tag_to_hwq(tag);
-			qpair = ha->queue_pair_map[hwq];
-		} else if (vha->vp_idx && vha->qpair) {
-			qpair = vha->qpair;
-		}
+		tag = blk_mq_unique_tag(cmd->request);
+		hwq = blk_mq_unique_tag_to_hwq(tag);
+		qpair = ha->queue_pair_map[hwq];
 
 		if (qpair)
 			return qla2xxx_mqueuecommand(host, cmd, qpair);
@@ -3153,7 +3149,7 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 		goto probe_failed;
 	}
 
-	if (ha->mqenable && shost_use_blk_mq(host)) {
+	if (ha->mqenable) {
 		/* number of hardware queues supported by blk/scsi-mq*/
 		host->nr_hw_queues = ha->max_qpairs;
 
@@ -3265,25 +3261,17 @@ qla2x00_probe_one(struct pci_dev *pdev, const struct pci_device_id *id)
 	    base_vha->mgmt_svr_loop_id, host->sg_tablesize);
 
 	if (ha->mqenable) {
-		bool mq = false;
 		bool startit = false;
 
-		if (QLA_TGT_MODE_ENABLED()) {
-			mq = true;
+		if (QLA_TGT_MODE_ENABLED())
 			startit = false;
-		}
 
-		if ((ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED) &&
-		    shost_use_blk_mq(host)) {
-			mq = true;
+		if (ql2x_ini_mode == QLA2XXX_INI_MODE_ENABLED)
 			startit = true;
-		}
 
-		if (mq) {
-			/* Create start of day qpairs for Block MQ */
-			for (i = 0; i < ha->max_qpairs; i++)
-				qla2xxx_create_qpair(base_vha, 5, 0, startit);
-		}
+		/* Create start of day qpairs for Block MQ */
+		for (i = 0; i < ha->max_qpairs; i++)
+			qla2xxx_create_qpair(base_vha, 5, 0, startit);
 	}
 
 	if (ha->flags.running_gold_fw)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index fc1356d101b0..7675ff0ca2ea 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -780,11 +780,8 @@ MODULE_LICENSE("GPL");
 module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR);
 MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels");
 
-#ifdef CONFIG_SCSI_MQ_DEFAULT
+/* This should go away in the future, it doesn't do anything anymore */
 bool scsi_use_blk_mq = true;
-#else
-bool scsi_use_blk_mq = false;
-#endif
 module_param_named(use_blk_mq, scsi_use_blk_mq, bool, S_IWUSR | S_IRUGO);
 
 static int __init init_scsi(void)
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index 60bcc6df97a9..4740f1e9dd17 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -5881,8 +5881,7 @@ static int sdebug_driver_probe(struct device *dev)
 	}
 	/* Decide whether to tell scsi subsystem that we want mq */
 	/* Following should give the same answer for each host */
-	if (shost_use_blk_mq(hpnt))
-		hpnt->nr_hw_queues = submit_queues;
+	hpnt->nr_hw_queues = submit_queues;
 
 	sdbg_host->shost = hpnt;
 	*((struct sdebug_host_info **)hpnt->hostdata) = sdbg_host;
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index c736d61b1648..fff128aa9ec2 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -308,7 +308,7 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
 		 * error handler. In that case we can return immediately as no
 		 * further action is required.
 		 */
-		if (req->q->mq_ops && !blk_mq_mark_complete(req))
+		if (!blk_mq_mark_complete(req))
 			return rtn;
 		if (scsi_abort_command(scmd) != SUCCESS) {
 			set_host_byte(scmd, DID_TIME_OUT);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8b0345924a92..651be30ba96a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -168,8 +168,6 @@ static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd)
 static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
 {
 	struct scsi_device *device = cmd->device;
-	struct request_queue *q = device->request_queue;
-	unsigned long flags;
 
 	SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd,
 		"Inserting command %p into mlqueue\n", cmd));
@@ -190,26 +188,20 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy)
 	 * before blk_cleanup_queue() finishes.
 	 */
 	cmd->result = 0;
-	if (q->mq_ops) {
-		/*
-		 * Before a SCSI command is dispatched,
-		 * get_device(&sdev->sdev_gendev) is called and the host,
-		 * target and device busy counters are increased. Since
-		 * requeuing a request causes these actions to be repeated and
-		 * since scsi_device_unbusy() has already been called,
-		 * put_device(&device->sdev_gendev) must still be called. Call
-		 * put_device() after blk_mq_requeue_request() to avoid that
-		 * removal of the SCSI device can start before requeueing has
-		 * happened.
-		 */
-		blk_mq_requeue_request(cmd->request, true);
-		put_device(&device->sdev_gendev);
-		return;
-	}
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_requeue_request(q, cmd->request);
-	kblockd_schedule_work(&device->requeue_work);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/*
+	 * Before a SCSI command is dispatched,
+	 * get_device(&sdev->sdev_gendev) is called and the host,
+	 * target and device busy counters are increased. Since
+	 * requeuing a request causes these actions to be repeated and
+	 * since scsi_device_unbusy() has already been called,
+	 * put_device(&device->sdev_gendev) must still be called. Call
+	 * put_device() after blk_mq_requeue_request() to avoid that
+	 * removal of the SCSI device can start before requeueing has
+	 * happened.
+	 */
+	blk_mq_requeue_request(cmd->request, true);
+	put_device(&device->sdev_gendev);
 }
 
 /*
@@ -370,10 +362,7 @@ void scsi_device_unbusy(struct scsi_device *sdev)
 
 static void scsi_kick_queue(struct request_queue *q)
 {
-	if (q->mq_ops)
-		blk_mq_run_hw_queues(q, false);
-	else
-		blk_run_queue(q);
+	blk_mq_run_hw_queues(q, false);
 }
 
 /*
@@ -534,10 +523,7 @@ static void scsi_run_queue(struct request_queue *q)
 	if (!list_empty(&sdev->host->starved_list))
 		scsi_starved_list_run(sdev->host);
 
-	if (q->mq_ops)
-		blk_mq_run_hw_queues(q, false);
-	else
-		blk_run_queue(q);
+	blk_mq_run_hw_queues(q, false);
 }
 
 void scsi_requeue_run_queue(struct work_struct *work)
@@ -550,42 +536,6 @@ void scsi_requeue_run_queue(struct work_struct *work)
 	scsi_run_queue(q);
 }
 
-/*
- * Function:	scsi_requeue_command()
- *
- * Purpose:	Handle post-processing of completed commands.
- *
- * Arguments:	q	- queue to operate on
- *		cmd	- command that may need to be requeued.
- *
- * Returns:	Nothing
- *
- * Notes:	After command completion, there may be blocks left
- *		over which weren't finished by the previous command
- *		this can be for a number of reasons - the main one is
- *		I/O errors in the middle of the request, in which case
- *		we need to request the blocks that come after the bad
- *		sector.
- * Notes:	Upon return, cmd is a stale pointer.
- */
-static void scsi_requeue_command(struct request_queue *q, struct scsi_cmnd *cmd)
-{
-	struct scsi_device *sdev = cmd->device;
-	struct request *req = cmd->request;
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_unprep_request(req);
-	req->special = NULL;
-	scsi_put_command(cmd);
-	blk_requeue_request(q, req);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	scsi_run_queue(q);
-
-	put_device(&sdev->sdev_gendev);
-}
-
 void scsi_run_host_queues(struct Scsi_Host *shost)
 {
 	struct scsi_device *sdev;
@@ -626,42 +576,6 @@ static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd)
 	scsi_del_cmd_from_list(cmd);
 }
 
-/*
- * Function:    scsi_release_buffers()
- *
- * Purpose:     Free resources allocate for a scsi_command.
- *
- * Arguments:   cmd	- command that we are bailing.
- *
- * Lock status: Assumed that no lock is held upon entry.
- *
- * Returns:     Nothing
- *
- * Notes:       In the event that an upper level driver rejects a
- *		command, we must release resources allocated during
- *		the __init_io() function.  Primarily this would involve
- *		the scatter-gather table.
- */
-static void scsi_release_buffers(struct scsi_cmnd *cmd)
-{
-	if (cmd->sdb.table.nents)
-		sg_free_table_chained(&cmd->sdb.table, false);
-
-	memset(&cmd->sdb, 0, sizeof(cmd->sdb));
-
-	if (scsi_prot_sg_count(cmd))
-		sg_free_table_chained(&cmd->prot_sdb->table, false);
-}
-
-static void scsi_release_bidi_buffers(struct scsi_cmnd *cmd)
-{
-	struct scsi_data_buffer *bidi_sdb = cmd->request->next_rq->special;
-
-	sg_free_table_chained(&bidi_sdb->table, false);
-	kmem_cache_free(scsi_sdb_cache, bidi_sdb);
-	cmd->request->next_rq->special = NULL;
-}
-
 /* Returns false when no more bytes to process, true if there are more */
 static bool scsi_end_request(struct request *req, blk_status_t error,
 		unsigned int bytes, unsigned int bidi_bytes)
@@ -687,37 +601,22 @@ static bool scsi_end_request(struct request *req, blk_status_t error,
 		destroy_rcu_head(&cmd->rcu);
 	}
 
-	if (req->mq_ctx) {
-		/*
-		 * In the MQ case the command gets freed by __blk_mq_end_request,
-		 * so we have to do all cleanup that depends on it earlier.
-		 *
-		 * We also can't kick the queues from irq context, so we
-		 * will have to defer it to a workqueue.
-		 */
-		scsi_mq_uninit_cmd(cmd);
+	/*
+	 * In the MQ case the command gets freed by __blk_mq_end_request,
+	 * so we have to do all cleanup that depends on it earlier.
+	 *
+	 * We also can't kick the queues from irq context, so we
+	 * will have to defer it to a workqueue.
+	 */
+	scsi_mq_uninit_cmd(cmd);
 
-		__blk_mq_end_request(req, error);
+	__blk_mq_end_request(req, error);
 
-		if (scsi_target(sdev)->single_lun ||
-		    !list_empty(&sdev->host->starved_list))
-			kblockd_schedule_work(&sdev->requeue_work);
-		else
-			blk_mq_run_hw_queues(q, true);
-	} else {
-		unsigned long flags;
-
-		if (bidi_bytes)
-			scsi_release_bidi_buffers(cmd);
-		scsi_release_buffers(cmd);
-		scsi_put_command(cmd);
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_finish_request(req, error);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
-		scsi_run_queue(q);
-	}
+	if (scsi_target(sdev)->single_lun ||
+	    !list_empty(&sdev->host->starved_list))
+		kblockd_schedule_work(&sdev->requeue_work);
+	else
+		blk_mq_run_hw_queues(q, true);
 
 	put_device(&sdev->sdev_gendev);
 	return false;
@@ -766,13 +665,7 @@ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd,
 				      struct request_queue *q)
 {
 	/* A new command will be prepared and issued. */
-	if (q->mq_ops) {
-		scsi_mq_requeue_cmd(cmd);
-	} else {
-		/* Unprep request and put it back at head of the queue. */
-		scsi_release_buffers(cmd);
-		scsi_requeue_command(q, cmd);
-	}
+	scsi_mq_requeue_cmd(cmd);
 }
 
 /* Helper for scsi_io_completion() when special action required. */
@@ -1147,9 +1040,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
  */
 int scsi_init_io(struct scsi_cmnd *cmd)
 {
-	struct scsi_device *sdev = cmd->device;
 	struct request *rq = cmd->request;
-	bool is_mq = (rq->mq_ctx != NULL);
 	int error = BLKPREP_KILL;
 
 	if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
@@ -1160,17 +1051,6 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 		goto err_exit;
 
 	if (blk_bidi_rq(rq)) {
-		if (!rq->q->mq_ops) {
-			struct scsi_data_buffer *bidi_sdb =
-				kmem_cache_zalloc(scsi_sdb_cache, GFP_ATOMIC);
-			if (!bidi_sdb) {
-				error = BLKPREP_DEFER;
-				goto err_exit;
-			}
-
-			rq->next_rq->special = bidi_sdb;
-		}
-
 		error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
 		if (error)
 			goto err_exit;
@@ -1210,14 +1090,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 
 	return BLKPREP_OK;
 err_exit:
-	if (is_mq) {
-		scsi_mq_free_sgtables(cmd);
-	} else {
-		scsi_release_buffers(cmd);
-		cmd->request->special = NULL;
-		scsi_put_command(cmd);
-		put_device(&sdev->sdev_gendev);
-	}
+	scsi_mq_free_sgtables(cmd);
 	return error;
 }
 EXPORT_SYMBOL(scsi_init_io);
@@ -1423,75 +1296,6 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
 	return ret;
 }
 
-static int
-scsi_prep_return(struct request_queue *q, struct request *req, int ret)
-{
-	struct scsi_device *sdev = q->queuedata;
-
-	switch (ret) {
-	case BLKPREP_KILL:
-	case BLKPREP_INVALID:
-		scsi_req(req)->result = DID_NO_CONNECT << 16;
-		/* release the command and kill it */
-		if (req->special) {
-			struct scsi_cmnd *cmd = req->special;
-			scsi_release_buffers(cmd);
-			scsi_put_command(cmd);
-			put_device(&sdev->sdev_gendev);
-			req->special = NULL;
-		}
-		break;
-	case BLKPREP_DEFER:
-		/*
-		 * If we defer, the blk_peek_request() returns NULL, but the
-		 * queue must be restarted, so we schedule a callback to happen
-		 * shortly.
-		 */
-		if (atomic_read(&sdev->device_busy) == 0)
-			blk_delay_queue(q, SCSI_QUEUE_DELAY);
-		break;
-	default:
-		req->rq_flags |= RQF_DONTPREP;
-	}
-
-	return ret;
-}
-
-static int scsi_prep_fn(struct request_queue *q, struct request *req)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
-	int ret;
-
-	ret = scsi_prep_state_check(sdev, req);
-	if (ret != BLKPREP_OK)
-		goto out;
-
-	if (!req->special) {
-		/* Bail if we can't get a reference to the device */
-		if (unlikely(!get_device(&sdev->sdev_gendev))) {
-			ret = BLKPREP_DEFER;
-			goto out;
-		}
-
-		scsi_init_command(sdev, cmd);
-		req->special = cmd;
-	}
-
-	cmd->tag = req->tag;
-	cmd->request = req;
-	cmd->prot_op = SCSI_PROT_NORMAL;
-
-	ret = scsi_setup_cmnd(sdev, req);
-out:
-	return scsi_prep_return(q, req, ret);
-}
-
-static void scsi_unprep_fn(struct request_queue *q, struct request *req)
-{
-	scsi_uninit_cmd(blk_mq_rq_to_pdu(req));
-}
-
 /*
  * scsi_dev_queue_ready: if we can send requests to sdev, return 1 else
  * return 0.
@@ -1511,14 +1315,8 @@ static inline int scsi_dev_queue_ready(struct request_queue *q,
 		/*
 		 * unblock after device_blocked iterates to zero
 		 */
-		if (atomic_dec_return(&sdev->device_blocked) > 0) {
-			/*
-			 * For the MQ case we take care of this in the caller.
-			 */
-			if (!q->mq_ops)
-				blk_delay_queue(q, SCSI_QUEUE_DELAY);
+		if (atomic_dec_return(&sdev->device_blocked) > 0)
 			goto out_dec;
-		}
 		SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev,
 				   "unblocking device at zero depth\n"));
 	}
@@ -1653,13 +1451,13 @@ out_dec:
  * needs to return 'not busy'. Otherwise, request stacking drivers
  * may hold requests forever.
  */
-static int scsi_lld_busy(struct request_queue *q)
+static bool scsi_mq_lld_busy(struct request_queue *q)
 {
 	struct scsi_device *sdev = q->queuedata;
 	struct Scsi_Host *shost;
 
 	if (blk_queue_dying(q))
-		return 0;
+		return false;
 
 	shost = sdev->host;
 
@@ -1670,48 +1468,9 @@ static int scsi_lld_busy(struct request_queue *q)
 	 * in SCSI layer.
 	 */
 	if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev))
-		return 1;
+		return true;
 
-	return 0;
-}
-
-static bool scsi_mq_lld_busy(struct request_queue *q)
-{
-	return scsi_lld_busy(q);
-}
-
-/*
- * Kill a request for a dead device
- */
-static void scsi_kill_request(struct request *req, struct request_queue *q)
-{
-	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
-	struct scsi_device *sdev;
-	struct scsi_target *starget;
-	struct Scsi_Host *shost;
-
-	blk_start_request(req);
-
-	scmd_printk(KERN_INFO, cmd, "killing request\n");
-
-	sdev = cmd->device;
-	starget = scsi_target(sdev);
-	shost = sdev->host;
-	scsi_init_cmd_errh(cmd);
-	cmd->result = DID_NO_CONNECT << 16;
-	atomic_inc(&cmd->device->iorequest_cnt);
-
-	/*
-	 * SCSI request completion path will do scsi_device_unbusy(),
-	 * bump busy counts.  To bump the counters, we need to dance
-	 * with the locks as normal issue path does.
-	 */
-	atomic_inc(&sdev->device_busy);
-	atomic_inc(&shost->host_busy);
-	if (starget->can_queue > 0)
-		atomic_inc(&starget->target_busy);
-
-	blk_complete_request(req);
+	return false;
 }
 
 static void scsi_softirq_done(struct request *rq)
@@ -1834,158 +1593,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	return 0;
 }
 
-/**
- * scsi_done - Invoke completion on finished SCSI command.
- * @cmd: The SCSI Command for which a low-level device driver (LLDD) gives
- * ownership back to SCSI Core -- i.e. the LLDD has finished with it.
- *
- * Description: This function is the mid-level's (SCSI Core) interrupt routine,
- * which regains ownership of the SCSI command (de facto) from a LLDD, and
- * calls blk_complete_request() for further processing.
- *
- * This function is interrupt context safe.
- */
-static void scsi_done(struct scsi_cmnd *cmd)
-{
-	trace_scsi_dispatch_cmd_done(cmd);
-	blk_complete_request(cmd->request);
-}
-
-/*
- * Function:    scsi_request_fn()
- *
- * Purpose:     Main strategy routine for SCSI.
- *
- * Arguments:   q       - Pointer to actual queue.
- *
- * Returns:     Nothing
- *
- * Lock status: request queue lock assumed to be held when called.
- *
- * Note: See sd_zbc.c sd_zbc_write_lock_zone() for write order
- * protection for ZBC disks.
- */
-static void scsi_request_fn(struct request_queue *q)
-	__releases(q->queue_lock)
-	__acquires(q->queue_lock)
-{
-	struct scsi_device *sdev = q->queuedata;
-	struct Scsi_Host *shost;
-	struct scsi_cmnd *cmd;
-	struct request *req;
-
-	/*
-	 * To start with, we keep looping until the queue is empty, or until
-	 * the host is no longer able to accept any more requests.
-	 */
-	shost = sdev->host;
-	for (;;) {
-		int rtn;
-		/*
-		 * get next queueable request.  We do this early to make sure
-		 * that the request is fully prepared even if we cannot
-		 * accept it.
-		 */
-		req = blk_peek_request(q);
-		if (!req)
-			break;
-
-		if (unlikely(!scsi_device_online(sdev))) {
-			sdev_printk(KERN_ERR, sdev,
-				    "rejecting I/O to offline device\n");
-			scsi_kill_request(req, q);
-			continue;
-		}
-
-		if (!scsi_dev_queue_ready(q, sdev))
-			break;
-
-		/*
-		 * Remove the request from the request list.
-		 */
-		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
-			blk_start_request(req);
-
-		spin_unlock_irq(q->queue_lock);
-		cmd = blk_mq_rq_to_pdu(req);
-		if (cmd != req->special) {
-			printk(KERN_CRIT "impossible request in %s.\n"
-					 "please mail a stack trace to "
-					 "linux-scsi@vger.kernel.org\n",
-					 __func__);
-			blk_dump_rq_flags(req, "foo");
-			BUG();
-		}
-
-		/*
-		 * We hit this when the driver is using a host wide
-		 * tag map. For device level tag maps the queue_depth check
-		 * in the device ready fn would prevent us from trying
-		 * to allocate a tag. Since the map is a shared host resource
-		 * we add the dev to the starved list so it eventually gets
-		 * a run when a tag is freed.
-		 */
-		if (blk_queue_tagged(q) && !(req->rq_flags & RQF_QUEUED)) {
-			spin_lock_irq(shost->host_lock);
-			if (list_empty(&sdev->starved_entry))
-				list_add_tail(&sdev->starved_entry,
-					      &shost->starved_list);
-			spin_unlock_irq(shost->host_lock);
-			goto not_ready;
-		}
-
-		if (!scsi_target_queue_ready(shost, sdev))
-			goto not_ready;
-
-		if (!scsi_host_queue_ready(q, shost, sdev))
-			goto host_not_ready;
-	
-		if (sdev->simple_tags)
-			cmd->flags |= SCMD_TAGGED;
-		else
-			cmd->flags &= ~SCMD_TAGGED;
-
-		/*
-		 * Finally, initialize any error handling parameters, and set up
-		 * the timers for timeouts.
-		 */
-		scsi_init_cmd_errh(cmd);
-
-		/*
-		 * Dispatch the command to the low-level driver.
-		 */
-		cmd->scsi_done = scsi_done;
-		rtn = scsi_dispatch_cmd(cmd);
-		if (rtn) {
-			scsi_queue_insert(cmd, rtn);
-			spin_lock_irq(q->queue_lock);
-			goto out_delay;
-		}
-		spin_lock_irq(q->queue_lock);
-	}
-
-	return;
-
- host_not_ready:
-	if (scsi_target(sdev)->can_queue > 0)
-		atomic_dec(&scsi_target(sdev)->target_busy);
- not_ready:
-	/*
-	 * lock q, handle tag, requeue req, and decrement device_busy. We
-	 * must return with queue_lock held.
-	 *
-	 * Decrementing device_busy without checking it is OK, as all such
-	 * cases (host limits or settings) should run the queue at some
-	 * later time.
-	 */
-	spin_lock_irq(q->queue_lock);
-	blk_requeue_request(q, req);
-	atomic_dec(&sdev->device_busy);
-out_delay:
-	if (!atomic_read(&sdev->device_busy) && !scsi_device_blocked(sdev))
-		blk_delay_queue(q, SCSI_QUEUE_DELAY);
-}
-
 static inline blk_status_t prep_to_mq(int ret)
 {
 	switch (ret) {
@@ -2248,77 +1855,6 @@ void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(__scsi_init_queue);
 
-static int scsi_old_init_rq(struct request_queue *q, struct request *rq,
-			    gfp_t gfp)
-{
-	struct Scsi_Host *shost = q->rq_alloc_data;
-	const bool unchecked_isa_dma = shost->unchecked_isa_dma;
-	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
-
-	memset(cmd, 0, sizeof(*cmd));
-
-	if (unchecked_isa_dma)
-		cmd->flags |= SCMD_UNCHECKED_ISA_DMA;
-	cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma, gfp,
-						    NUMA_NO_NODE);
-	if (!cmd->sense_buffer)
-		goto fail;
-	cmd->req.sense = cmd->sense_buffer;
-
-	if (scsi_host_get_prot(shost) >= SHOST_DIX_TYPE0_PROTECTION) {
-		cmd->prot_sdb = kmem_cache_zalloc(scsi_sdb_cache, gfp);
-		if (!cmd->prot_sdb)
-			goto fail_free_sense;
-	}
-
-	return 0;
-
-fail_free_sense:
-	scsi_free_sense_buffer(unchecked_isa_dma, cmd->sense_buffer);
-fail:
-	return -ENOMEM;
-}
-
-static void scsi_old_exit_rq(struct request_queue *q, struct request *rq)
-{
-	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq);
-
-	if (cmd->prot_sdb)
-		kmem_cache_free(scsi_sdb_cache, cmd->prot_sdb);
-	scsi_free_sense_buffer(cmd->flags & SCMD_UNCHECKED_ISA_DMA,
-			       cmd->sense_buffer);
-}
-
-struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev)
-{
-	struct Scsi_Host *shost = sdev->host;
-	struct request_queue *q;
-
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
-	if (!q)
-		return NULL;
-	q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
-	q->rq_alloc_data = shost;
-	q->request_fn = scsi_request_fn;
-	q->init_rq_fn = scsi_old_init_rq;
-	q->exit_rq_fn = scsi_old_exit_rq;
-	q->initialize_rq_fn = scsi_initialize_rq;
-
-	if (blk_init_allocated_queue(q) < 0) {
-		blk_cleanup_queue(q);
-		return NULL;
-	}
-
-	__scsi_init_queue(shost, q);
-	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, q);
-	blk_queue_prep_rq(q, scsi_prep_fn);
-	blk_queue_unprep_rq(q, scsi_unprep_fn);
-	blk_queue_softirq_done(q, scsi_softirq_done);
-	blk_queue_rq_timed_out(q, scsi_times_out);
-	blk_queue_lld_busy(q, scsi_lld_busy);
-	return q;
-}
-
 static const struct blk_mq_ops scsi_mq_ops = {
 	.get_budget	= scsi_mq_get_budget,
 	.put_budget	= scsi_mq_put_budget,
@@ -2386,10 +1922,7 @@ struct scsi_device *scsi_device_from_queue(struct request_queue *q)
 {
 	struct scsi_device *sdev = NULL;
 
-	if (q->mq_ops) {
-		if (q->mq_ops == &scsi_mq_ops)
-			sdev = q->queuedata;
-	} else if (q->request_fn == scsi_request_fn)
+	if (q->mq_ops == &scsi_mq_ops)
 		sdev = q->queuedata;
 	if (!sdev || !get_device(&sdev->sdev_gendev))
 		sdev = NULL;
@@ -2992,39 +2525,6 @@ void sdev_evt_send_simple(struct scsi_device *sdev,
 }
 EXPORT_SYMBOL_GPL(sdev_evt_send_simple);
 
-/**
- * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn()
- * @sdev: SCSI device to count the number of scsi_request_fn() callers for.
- */
-static int scsi_request_fn_active(struct scsi_device *sdev)
-{
-	struct request_queue *q = sdev->request_queue;
-	int request_fn_active;
-
-	WARN_ON_ONCE(sdev->host->use_blk_mq);
-
-	spin_lock_irq(q->queue_lock);
-	request_fn_active = q->request_fn_active;
-	spin_unlock_irq(q->queue_lock);
-
-	return request_fn_active;
-}
-
-/**
- * scsi_wait_for_queuecommand() - wait for ongoing queuecommand() calls
- * @sdev: SCSI device pointer.
- *
- * Wait until the ongoing shost->hostt->queuecommand() calls that are
- * invoked from scsi_request_fn() have finished.
- */
-static void scsi_wait_for_queuecommand(struct scsi_device *sdev)
-{
-	WARN_ON_ONCE(sdev->host->use_blk_mq);
-
-	while (scsi_request_fn_active(sdev))
-		msleep(20);
-}
-
 /**
  *	scsi_device_quiesce - Block user issued commands.
  *	@sdev:	scsi device to quiesce.
@@ -3148,7 +2648,6 @@ EXPORT_SYMBOL(scsi_target_resume);
 int scsi_internal_device_block_nowait(struct scsi_device *sdev)
 {
 	struct request_queue *q = sdev->request_queue;
-	unsigned long flags;
 	int err = 0;
 
 	err = scsi_device_set_state(sdev, SDEV_BLOCK);
@@ -3164,14 +2663,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev)
 	 * block layer from calling the midlayer with this device's
 	 * request queue. 
 	 */
-	if (q->mq_ops) {
-		blk_mq_quiesce_queue_nowait(q);
-	} else {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_stop_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
-
+	blk_mq_quiesce_queue_nowait(q);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait);
@@ -3202,12 +2694,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev)
 
 	mutex_lock(&sdev->state_mutex);
 	err = scsi_internal_device_block_nowait(sdev);
-	if (err == 0) {
-		if (q->mq_ops)
-			blk_mq_quiesce_queue(q);
-		else
-			scsi_wait_for_queuecommand(sdev);
-	}
+	if (err == 0)
+		blk_mq_quiesce_queue(q);
 	mutex_unlock(&sdev->state_mutex);
 
 	return err;
@@ -3216,15 +2704,8 @@ static int scsi_internal_device_block(struct scsi_device *sdev)
 void scsi_start_queue(struct scsi_device *sdev)
 {
 	struct request_queue *q = sdev->request_queue;
-	unsigned long flags;
 
-	if (q->mq_ops) {
-		blk_mq_unquiesce_queue(q);
-	} else {
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_start_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
+	blk_mq_unquiesce_queue(q);
 }
 
 /**
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index 99f1db5e467e..5f21547b2ad2 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -92,7 +92,6 @@ extern void scsi_queue_insert(struct scsi_cmnd *cmd, int reason);
 extern void scsi_io_completion(struct scsi_cmnd *, unsigned int);
 extern void scsi_run_host_queues(struct Scsi_Host *shost);
 extern void scsi_requeue_run_queue(struct work_struct *work);
-extern struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev);
 extern struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev);
 extern void scsi_start_queue(struct scsi_device *sdev);
 extern int scsi_mq_setup_tags(struct Scsi_Host *shost);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 78ca63dfba4a..dd0d516f65e2 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -266,10 +266,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	 */
 	sdev->borken = 1;
 
-	if (shost_use_blk_mq(shost))
-		sdev->request_queue = scsi_mq_alloc_queue(sdev);
-	else
-		sdev->request_queue = scsi_old_alloc_queue(sdev);
+	sdev->request_queue = scsi_mq_alloc_queue(sdev);
 	if (!sdev->request_queue) {
 		/* release fn is set up in scsi_sysfs_device_initialise, so
 		 * have to free and put manually here */
@@ -280,11 +277,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
 	WARN_ON_ONCE(!blk_get_queue(sdev->request_queue));
 	sdev->request_queue->queuedata = sdev;
 
-	if (!shost_use_blk_mq(sdev->host)) {
-		blk_queue_init_tags(sdev->request_queue,
-				    sdev->host->cmd_per_lun, shost->bqt,
-				    shost->hostt->tag_alloc_policy);
-	}
 	scsi_change_queue_depth(sdev, sdev->host->cmd_per_lun ?
 					sdev->host->cmd_per_lun : 1);
 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 3aee9464a7bf..6a9040faed00 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -367,7 +367,6 @@ store_shost_eh_deadline(struct device *dev, struct device_attribute *attr,
 
 static DEVICE_ATTR(eh_deadline, S_IRUGO | S_IWUSR, show_shost_eh_deadline, store_shost_eh_deadline);
 
-shost_rd_attr(use_blk_mq, "%d\n");
 shost_rd_attr(unique_id, "%u\n");
 shost_rd_attr(cmd_per_lun, "%hd\n");
 shost_rd_attr(can_queue, "%hd\n");
@@ -386,6 +385,13 @@ show_host_busy(struct device *dev, struct device_attribute *attr, char *buf)
 }
 static DEVICE_ATTR(host_busy, S_IRUGO, show_host_busy, NULL);
 
+static ssize_t
+show_use_blk_mq(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "1\n");
+}
+static DEVICE_ATTR(use_blk_mq, S_IRUGO, show_use_blk_mq, NULL);
+
 static struct attribute *scsi_sysfs_shost_attrs[] = {
 	&dev_attr_use_blk_mq.attr,
 	&dev_attr_unique_id.attr,
diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
index 23d7cca36ff0..fb308ea8e9a5 100644
--- a/drivers/scsi/ufs/ufshcd.c
+++ b/drivers/scsi/ufs/ufshcd.c
@@ -8100,12 +8100,6 @@ int ufshcd_alloc_host(struct device *dev, struct ufs_hba **hba_handle)
 		goto out_error;
 	}
 
-	/*
-	 * Do not use blk-mq at this time because blk-mq does not support
-	 * runtime pm.
-	 */
-	host->use_blk_mq = false;
-
 	hba = shost_priv(host);
 	hba->host = host;
 	hba->dev = dev;
diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h
index 5ea06d310a25..aa760df8c6b3 100644
--- a/include/scsi/scsi_host.h
+++ b/include/scsi/scsi_host.h
@@ -11,7 +11,6 @@
 #include <linux/blk-mq.h>
 #include <scsi/scsi.h>
 
-struct request_queue;
 struct block_device;
 struct completion;
 struct module;
@@ -22,7 +21,6 @@ struct scsi_target;
 struct Scsi_Host;
 struct scsi_host_cmd_pool;
 struct scsi_transport_template;
-struct blk_queue_tags;
 
 
 /*
@@ -547,14 +545,8 @@ struct Scsi_Host {
 	struct scsi_host_template *hostt;
 	struct scsi_transport_template *transportt;
 
-	/*
-	 * Area to keep a shared tag map (if needed, will be
-	 * NULL if not).
-	 */
-	union {
-		struct blk_queue_tag	*bqt;
-		struct blk_mq_tag_set	tag_set;
-	};
+	/* Area to keep a shared tag map */
+	struct blk_mq_tag_set	tag_set;
 
 	atomic_t host_busy;		   /* commands actually active on low-level */
 	atomic_t host_blocked;
@@ -648,7 +640,6 @@ struct Scsi_Host {
 	/* The controller does not support WRITE SAME */
 	unsigned no_write_same:1;
 
-	unsigned use_blk_mq:1;
 	unsigned use_cmd_list:1;
 
 	/* Host responded with short (<36 bytes) INQUIRY result */
@@ -742,11 +733,6 @@ static inline int scsi_host_in_recovery(struct Scsi_Host *shost)
 		shost->tmf_in_progress;
 }
 
-static inline bool shost_use_blk_mq(struct Scsi_Host *shost)
-{
-	return shost->use_blk_mq;
-}
-
 extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *);
 extern void scsi_flush_work(struct Scsi_Host *);
 
diff --git a/include/scsi/scsi_tcq.h b/include/scsi/scsi_tcq.h
index e192a0caa850..6053d46e794e 100644
--- a/include/scsi/scsi_tcq.h
+++ b/include/scsi/scsi_tcq.h
@@ -23,19 +23,15 @@ static inline struct scsi_cmnd *scsi_host_find_tag(struct Scsi_Host *shost,
 		int tag)
 {
 	struct request *req = NULL;
+	u16 hwq;
 
 	if (tag == SCSI_NO_TAG)
 		return NULL;
 
-	if (shost_use_blk_mq(shost)) {
-		u16 hwq = blk_mq_unique_tag_to_hwq(tag);
-
-		if (hwq < shost->tag_set.nr_hw_queues) {
-			req = blk_mq_tag_to_rq(shost->tag_set.tags[hwq],
-				blk_mq_unique_tag_to_tag(tag));
-		}
-	} else {
-		req = blk_map_queue_find_tag(shost->bqt, tag);
+	hwq = blk_mq_unique_tag_to_hwq(tag);
+	if (hwq < shost->tag_set.nr_hw_queues) {
+		req = blk_mq_tag_to_rq(shost->tag_set.tags[hwq],
+					blk_mq_unique_tag_to_tag(tag));
 	}
 
 	if (!req)

From c6f2882691e8fd128083abdcc3c5aa5b410c2367 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:22:19 -0600
Subject: [PATCH 015/351] block: remove q->lld_busy_fn()

Nobody is using the legacy path for blk_lld_busy() anymore, remove
it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 2 --
 block/blk-settings.c   | 6 ------
 include/linux/blkdev.h | 3 ---
 3 files changed, 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ca1a3af49f87..03ef8f0e7dc5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3429,8 +3429,6 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
  */
 int blk_lld_busy(struct request_queue *q)
 {
-	if (q->lld_busy_fn)
-		return q->lld_busy_fn(q);
 	if (q->mq_ops && q->mq_ops->busy)
 		return q->mq_ops->busy(q);
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 696c04c1ab6c..ac8b8ba4b126 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -73,12 +73,6 @@ void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
 }
 EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
 
-void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
-{
-	q->lld_busy_fn = fn;
-}
-EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
-
 /**
  * blk_set_default_limits - reset limits to default values
  * @lim:  the queue_limits structure to reset
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4293dc1cd160..e867733b761d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -320,7 +320,6 @@ typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
-typedef int (lld_busy_fn) (struct request_queue *q);
 typedef int (bsg_job_fn) (struct bsg_job *);
 typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
 typedef void (exit_rq_fn)(struct request_queue *, struct request *);
@@ -466,7 +465,6 @@ struct request_queue {
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
-	lld_busy_fn		*lld_busy_fn;
 	/* Called just after a request is allocated */
 	init_rq_fn		*init_rq_fn;
 	/* Called just before a request is freed */
@@ -1255,7 +1253,6 @@ extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size);
-extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);

From 583d6535cb9dadfd2678acfad27231076eeccf8e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 25 Oct 2018 11:07:57 +0200
Subject: [PATCH 016/351] dasd: remove dead code

Since e443343e509a we haven't had a request_fn attached to
this driver, hence any code inside an if (q->request_fn) is
unreachable.

Fixes: e443343e509a ("s390/dasd: blk-mq conversion")
[sth: Keep and fix the dasd_info->chanq_len counter.]
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Jan Hoeppner <hoeppner@linux.ibm.com>
Signed-off-by: Stefan Haberland <sth@linux.ibm.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/s390/block/dasd_ioctl.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index 2016e0ed5865..8e26001dc11c 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -412,6 +412,7 @@ static int dasd_ioctl_information(struct dasd_block *block,
 	struct ccw_dev_id dev_id;
 	struct dasd_device *base;
 	struct ccw_device *cdev;
+	struct list_head *l;
 	unsigned long flags;
 	int rc;
 
@@ -462,23 +463,10 @@ static int dasd_ioctl_information(struct dasd_block *block,
 
 	memcpy(dasd_info->type, base->discipline->name, 4);
 
-	if (block->request_queue->request_fn) {
-		struct list_head *l;
-#ifdef DASD_EXTENDED_PROFILING
-		{
-			struct list_head *l;
-			spin_lock_irqsave(&block->lock, flags);
-			list_for_each(l, &block->request_queue->queue_head)
-				dasd_info->req_queue_len++;
-			spin_unlock_irqrestore(&block->lock, flags);
-		}
-#endif				/* DASD_EXTENDED_PROFILING */
-		spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags);
-		list_for_each(l, &base->ccw_queue)
-			dasd_info->chanq_len++;
-		spin_unlock_irqrestore(get_ccwdev_lock(base->cdev),
-				       flags);
-	}
+	spin_lock_irqsave(&block->queue_lock, flags);
+	list_for_each(l, &base->ccw_queue)
+		dasd_info->chanq_len++;
+	spin_unlock_irqrestore(&block->queue_lock, flags);
 
 	rc = 0;
 	if (copy_to_user(argp, dasd_info,

From aae3b069d5ce865ca5ef2902c2a22cef7ab4f3a2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 26 Oct 2018 11:26:25 -0600
Subject: [PATCH 017/351] bsg: pass in desired timeout handler

This will ease in the conversion to blk-mq, where we can't set
a timeout handler after queue init.

Cc: Johannes Thumshirn <jthumshirn@suse.de>
Cc: linux-scsi@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bsg-lib.c                     | 3 ++-
 drivers/scsi/scsi_transport_fc.c    | 7 +++----
 drivers/scsi/scsi_transport_iscsi.c | 2 +-
 drivers/scsi/scsi_transport_sas.c   | 4 ++--
 drivers/scsi/ufs/ufs_bsg.c          | 2 +-
 include/linux/bsg-lib.h             | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f3501cdaf1a6..1da011ec04e6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -304,7 +304,7 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
  * @dd_job_size: size of LLD data needed for each job
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, int dd_job_size)
+		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size)
 {
 	struct request_queue *q;
 	int ret;
@@ -327,6 +327,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
 	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
+	blk_queue_rq_timed_out(q, timeout);
 
 	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
 	if (ret) {
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 381668fa135d..98aaffb4c715 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3780,7 +3780,8 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	snprintf(bsg_name, sizeof(bsg_name),
 		 "fc_host%d", shost->host_no);
 
-	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, i->f->dd_bsg_size);
+	q = bsg_setup_queue(dev, bsg_name, fc_bsg_dispatch, fc_bsg_job_timeout,
+				i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev,
 			"fc_host%d: bsg interface failed to initialize - setup queue\n",
@@ -3788,7 +3789,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 		return PTR_ERR(q);
 	}
 	__scsi_init_queue(shost, q);
-	blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
 	blk_queue_rq_timeout(q, FC_DEFAULT_BSG_TIMEOUT);
 	fc_host->rqst_q = q;
 	return 0;
@@ -3826,14 +3826,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 		return -ENOTSUPP;
 
 	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
-			i->f->dd_bsg_size);
+				fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
 		return PTR_ERR(q);
 	}
 	__scsi_init_queue(shost, q);
 	blk_queue_prep_rq(q, fc_bsg_rport_prep);
-	blk_queue_rq_timed_out(q, fc_bsg_job_timeout);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 	rport->rqst_q = q;
 	return 0;
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 6fd2fe210fc3..26b11a775be9 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1542,7 +1542,7 @@ iscsi_bsg_host_add(struct Scsi_Host *shost, struct iscsi_cls_host *ihost)
 		return -ENOTSUPP;
 
 	snprintf(bsg_name, sizeof(bsg_name), "iscsi_host%d", shost->host_no);
-	q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, 0);
+	q = bsg_setup_queue(dev, bsg_name, iscsi_bsg_host_dispatch, NULL, 0);
 	if (IS_ERR(q)) {
 		shost_printk(KERN_ERR, shost, "bsg interface failed to "
 			     "initialize - no request queue\n");
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 0a165b2b3e81..cf6d47891d77 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -198,7 +198,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 
 	if (rphy) {
 		q = bsg_setup_queue(&rphy->dev, dev_name(&rphy->dev),
-				sas_smp_dispatch, 0);
+				sas_smp_dispatch, NULL, 0);
 		if (IS_ERR(q))
 			return PTR_ERR(q);
 		rphy->q = q;
@@ -207,7 +207,7 @@ static int sas_bsg_initialize(struct Scsi_Host *shost, struct sas_rphy *rphy)
 
 		snprintf(name, sizeof(name), "sas_host%d", shost->host_no);
 		q = bsg_setup_queue(&shost->shost_gendev, name,
-				sas_smp_dispatch, 0);
+				sas_smp_dispatch, NULL, 0);
 		if (IS_ERR(q))
 			return PTR_ERR(q);
 		to_sas_host_attrs(shost)->q = q;
diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c
index e5f8e54bf644..dd0e9700a74c 100644
--- a/drivers/scsi/ufs/ufs_bsg.c
+++ b/drivers/scsi/ufs/ufs_bsg.c
@@ -193,7 +193,7 @@ int ufs_bsg_probe(struct ufs_hba *hba)
 	if (ret)
 		goto out;
 
-	q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, 0);
+	q = bsg_setup_queue(bsg_dev, dev_name(bsg_dev), ufs_bsg_request, NULL, 0);
 	if (IS_ERR(q)) {
 		ret = PTR_ERR(q);
 		goto out;
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 6aeaf6472665..b13ae143e7ef 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -72,7 +72,7 @@ struct bsg_job {
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, int dd_job_size);
+		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
 

From 5e28b8d8a1b03ce86f33d38a64a4983d2b5c7679 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 26 Oct 2018 11:27:02 -0600
Subject: [PATCH 018/351] bsg: provide bsg_remove_queue() helper

All drivers do unregister + cleanup, provide a helper for that.

Cc: linux-scsi@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bsg-lib.c                     | 9 +++++++++
 drivers/scsi/scsi_transport_fc.c    | 5 +----
 drivers/scsi/scsi_transport_iscsi.c | 5 +----
 drivers/scsi/scsi_transport_sas.c   | 6 +-----
 drivers/scsi/ufs/ufs_bsg.c          | 2 +-
 include/linux/bsg-lib.h             | 1 +
 6 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 1da011ec04e6..3f2e9a1bae44 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -296,6 +296,15 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
 	kfree(job->reply);
 }
 
+void bsg_remove_queue(struct request_queue *q)
+{
+	if (q) {
+		bsg_unregister_queue(q);
+		blk_cleanup_queue(q);
+	}
+}
+EXPORT_SYMBOL_GPL(bsg_remove_queue);
+
 /**
  * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
  * @dev: device to attach bsg device to
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 98aaffb4c715..638f83ab04b2 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3851,10 +3851,7 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 static void
 fc_bsg_remove(struct request_queue *q)
 {
-	if (q) {
-		bsg_unregister_queue(q);
-		blk_cleanup_queue(q);
-	}
+	bsg_remove_queue(q);
 }
 
 
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 26b11a775be9..ff123023e5a5 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -1576,10 +1576,7 @@ static int iscsi_remove_host(struct transport_container *tc,
 	struct Scsi_Host *shost = dev_to_shost(dev);
 	struct iscsi_cls_host *ihost = shost->shost_data;
 
-	if (ihost->bsg_q) {
-		bsg_unregister_queue(ihost->bsg_q);
-		blk_cleanup_queue(ihost->bsg_q);
-	}
+	bsg_remove_queue(ihost->bsg_q);
 	return 0;
 }
 
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index cf6d47891d77..692b46937e52 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -246,11 +246,7 @@ static int sas_host_remove(struct transport_container *tc, struct device *dev,
 	struct Scsi_Host *shost = dev_to_shost(dev);
 	struct request_queue *q = to_sas_host_attrs(shost)->q;
 
-	if (q) {
-		bsg_unregister_queue(q);
-		blk_cleanup_queue(q);
-	}
-
+	bsg_remove_queue(q);
 	return 0;
 }
 
diff --git a/drivers/scsi/ufs/ufs_bsg.c b/drivers/scsi/ufs/ufs_bsg.c
index dd0e9700a74c..775bb4e5e36e 100644
--- a/drivers/scsi/ufs/ufs_bsg.c
+++ b/drivers/scsi/ufs/ufs_bsg.c
@@ -157,7 +157,7 @@ void ufs_bsg_remove(struct ufs_hba *hba)
 	if (!hba->bsg_queue)
 		return;
 
-	bsg_unregister_queue(hba->bsg_queue);
+	bsg_remove_queue(hba->bsg_queue);
 
 	device_del(bsg_dev);
 	put_device(bsg_dev);
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index b13ae143e7ef..9c9b134b1fa5 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -73,6 +73,7 @@ void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size);
+void bsg_remove_queue(struct request_queue *q);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);
 

From cd2f076f1d7ac20a93029ab38646b303f1c1468e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 07:11:39 -0600
Subject: [PATCH 019/351] bsg: convert to use blk-mq

Requires a few changes to the FC transport class as well.

Cc: linux-scsi@vger.kernel.org
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Tested-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bsg-lib.c                  | 119 +++++++++++++++++++------------
 drivers/scsi/scsi_transport_fc.c |  59 ++++++++-------
 2 files changed, 108 insertions(+), 70 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 3f2e9a1bae44..faf20f4500c9 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -21,7 +21,7 @@
  *
  */
 #include <linux/slab.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/bsg-lib.h>
@@ -129,7 +129,7 @@ static void bsg_teardown_job(struct kref *kref)
 	kfree(job->request_payload.sg_list);
 	kfree(job->reply_payload.sg_list);
 
-	blk_end_request_all(rq, BLK_STS_OK);
+	blk_mq_end_request(rq, BLK_STS_OK);
 }
 
 void bsg_job_put(struct bsg_job *job)
@@ -157,15 +157,15 @@ void bsg_job_done(struct bsg_job *job, int result,
 {
 	job->result = result;
 	job->reply_payload_rcv_len = reply_payload_rcv_len;
-	blk_complete_request(blk_mq_rq_from_pdu(job));
+	blk_mq_complete_request(blk_mq_rq_from_pdu(job));
 }
 EXPORT_SYMBOL_GPL(bsg_job_done);
 
 /**
- * bsg_softirq_done - softirq done routine for destroying the bsg requests
+ * bsg_complete - softirq done routine for destroying the bsg requests
  * @rq: BSG request that holds the job to be destroyed
  */
-static void bsg_softirq_done(struct request *rq)
+static void bsg_complete(struct request *rq)
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(rq);
 
@@ -224,54 +224,46 @@ failjob_rls_job:
 }
 
 /**
- * bsg_request_fn - generic handler for bsg requests
- * @q: request queue to manage
+ * bsg_queue_rq - generic handler for bsg requests
+ * @hctx: hardware queue
+ * @bd: queue data
  *
  * On error the create_bsg_job function should return a -Exyz error value
  * that will be set to ->result.
  *
  * Drivers/subsys should pass this to the queue init function.
  */
-static void bsg_request_fn(struct request_queue *q)
-	__releases(q->queue_lock)
-	__acquires(q->queue_lock)
+static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
 {
+	struct request_queue *q = hctx->queue;
 	struct device *dev = q->queuedata;
-	struct request *req;
+	struct request *req = bd->rq;
 	int ret;
 
+	blk_mq_start_request(req);
+
 	if (!get_device(dev))
-		return;
+		return BLK_STS_IOERR;
 
-	while (1) {
-		req = blk_fetch_request(q);
-		if (!req)
-			break;
-		spin_unlock_irq(q->queue_lock);
+	if (!bsg_prepare_job(dev, req))
+		return BLK_STS_IOERR;
 
-		if (!bsg_prepare_job(dev, req)) {
-			blk_end_request_all(req, BLK_STS_OK);
-			spin_lock_irq(q->queue_lock);
-			continue;
-		}
+	ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
+	if (ret)
+		return BLK_STS_IOERR;
 
-		ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
-		spin_lock_irq(q->queue_lock);
-		if (ret)
-			break;
-	}
-
-	spin_unlock_irq(q->queue_lock);
 	put_device(dev);
-	spin_lock_irq(q->queue_lock);
+	return BLK_STS_OK;
 }
 
 /* called right after the request is allocated for the request_queue */
-static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp)
+static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
+		       unsigned int hctx_idx, unsigned int numa_node)
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 
-	job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp);
+	job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL);
 	if (!job->reply)
 		return -ENOMEM;
 	return 0;
@@ -289,7 +281,8 @@ static void bsg_initialize_rq(struct request *req)
 	job->dd_data = job + 1;
 }
 
-static void bsg_exit_rq(struct request_queue *q, struct request *req)
+static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
+		       unsigned int hctx_idx)
 {
 	struct bsg_job *job = blk_mq_rq_to_pdu(req);
 
@@ -299,12 +292,36 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req)
 void bsg_remove_queue(struct request_queue *q)
 {
 	if (q) {
+		struct blk_mq_tag_set *set = q->tag_set;
+
 		bsg_unregister_queue(q);
 		blk_cleanup_queue(q);
+		blk_mq_free_tag_set(set);
+		kfree(set);
 	}
 }
 EXPORT_SYMBOL_GPL(bsg_remove_queue);
 
+static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
+{
+	enum blk_eh_timer_return ret = BLK_EH_DONE;
+	struct request_queue *q = rq->q;
+
+	if (q->rq_timed_out_fn)
+		ret = q->rq_timed_out_fn(rq);
+
+	return ret;
+}
+
+static const struct blk_mq_ops bsg_mq_ops = {
+	.queue_rq		= bsg_queue_rq,
+	.init_request		= bsg_init_rq,
+	.exit_request		= bsg_exit_rq,
+	.initialize_rq_fn	= bsg_initialize_rq,
+	.complete		= bsg_complete,
+	.timeout		= bsg_timeout,
+};
+
 /**
  * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
  * @dev: device to attach bsg device to
@@ -315,28 +332,34 @@ EXPORT_SYMBOL_GPL(bsg_remove_queue);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size)
 {
+	struct blk_mq_tag_set *set;
 	struct request_queue *q;
-	int ret;
+	int ret = -ENOMEM;
 
-	q = blk_alloc_queue(GFP_KERNEL);
-	if (!q)
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
+	if (!set)
 		return ERR_PTR(-ENOMEM);
-	q->cmd_size = sizeof(struct bsg_job) + dd_job_size;
-	q->init_rq_fn = bsg_init_rq;
-	q->exit_rq_fn = bsg_exit_rq;
-	q->initialize_rq_fn = bsg_initialize_rq;
-	q->request_fn = bsg_request_fn;
 
-	ret = blk_init_allocated_queue(q);
-	if (ret)
-		goto out_cleanup_queue;
+	set->ops = &bsg_mq_ops,
+	set->nr_hw_queues = 1;
+	set->queue_depth = 128;
+	set->numa_node = NUMA_NO_NODE;
+	set->cmd_size = sizeof(struct bsg_job) + dd_job_size;
+	set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING;
+	if (blk_mq_alloc_tag_set(set))
+		goto out_tag_set;
+
+	q = blk_mq_init_queue(set);
+	if (IS_ERR(q)) {
+		ret = PTR_ERR(q);
+		goto out_queue;
+	}
 
 	q->queuedata = dev;
 	q->bsg_job_fn = job_fn;
 	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
-	blk_queue_softirq_done(q, bsg_softirq_done);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
-	blk_queue_rq_timed_out(q, timeout);
+	q->rq_timed_out_fn = timeout;
 
 	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
 	if (ret) {
@@ -348,6 +371,10 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	return q;
 out_cleanup_queue:
 	blk_cleanup_queue(q);
+out_queue:
+	blk_mq_free_tag_set(set);
+out_tag_set:
+	kfree(set);
 	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(bsg_setup_queue);
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 638f83ab04b2..d7035270d274 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -3592,7 +3592,7 @@ fc_bsg_job_timeout(struct request *req)
 
 	/* the blk_end_sync_io() doesn't check the error */
 	if (inflight)
-		__blk_complete_request(req);
+		blk_mq_end_request(req, BLK_STS_IOERR);
 	return BLK_EH_DONE;
 }
 
@@ -3684,14 +3684,9 @@ static void
 fc_bsg_goose_queue(struct fc_rport *rport)
 {
 	struct request_queue *q = rport->rqst_q;
-	unsigned long flags;
 
-	if (!q)
-		return;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_run_queue_async(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	if (q)
+		blk_mq_run_hw_queues(q, true);
 }
 
 /**
@@ -3759,6 +3754,37 @@ static int fc_bsg_dispatch(struct bsg_job *job)
 		return fc_bsg_host_dispatch(shost, job);
 }
 
+static blk_status_t fc_bsg_rport_prep(struct fc_rport *rport)
+{
+	if (rport->port_state == FC_PORTSTATE_BLOCKED &&
+	    !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
+		return BLK_STS_RESOURCE;
+
+	if (rport->port_state != FC_PORTSTATE_ONLINE)
+		return BLK_STS_IOERR;
+
+	return BLK_STS_OK;
+}
+
+
+static int fc_bsg_dispatch_prep(struct bsg_job *job)
+{
+	struct fc_rport *rport = fc_bsg_to_rport(job);
+	blk_status_t ret;
+
+	ret = fc_bsg_rport_prep(rport);
+	switch (ret) {
+	case BLK_STS_OK:
+		break;
+	case BLK_STS_RESOURCE:
+		return -EAGAIN;
+	default:
+		return -EIO;
+	}
+
+	return fc_bsg_dispatch(job);
+}
+
 /**
  * fc_bsg_hostadd - Create and add the bsg hooks so we can receive requests
  * @shost:	shost for fc_host
@@ -3794,20 +3820,6 @@ fc_bsg_hostadd(struct Scsi_Host *shost, struct fc_host_attrs *fc_host)
 	return 0;
 }
 
-static int fc_bsg_rport_prep(struct request_queue *q, struct request *req)
-{
-	struct fc_rport *rport = dev_to_rport(q->queuedata);
-
-	if (rport->port_state == FC_PORTSTATE_BLOCKED &&
-	    !(rport->flags & FC_RPORT_FAST_FAIL_TIMEDOUT))
-		return BLKPREP_DEFER;
-
-	if (rport->port_state != FC_PORTSTATE_ONLINE)
-		return BLKPREP_KILL;
-
-	return BLKPREP_OK;
-}
-
 /**
  * fc_bsg_rportadd - Create and add the bsg hooks so we can receive requests
  * @shost:	shost that rport is attached to
@@ -3825,14 +3837,13 @@ fc_bsg_rportadd(struct Scsi_Host *shost, struct fc_rport *rport)
 	if (!i->f->bsg_request)
 		return -ENOTSUPP;
 
-	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch,
+	q = bsg_setup_queue(dev, dev_name(dev), fc_bsg_dispatch_prep,
 				fc_bsg_job_timeout, i->f->dd_bsg_size);
 	if (IS_ERR(q)) {
 		dev_err(dev, "failed to setup bsg queue\n");
 		return PTR_ERR(q);
 	}
 	__scsi_init_queue(shost, q);
-	blk_queue_prep_rq(q, fc_bsg_rport_prep);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 	rport->rqst_q = q;
 	return 0;

From 771a93c489bf486b957c7399f89ee06d43ba2d93 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Oct 2018 05:12:32 -0600
Subject: [PATCH 020/351] block: remove blk_complete_request()

It's now unused.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-softirq.c    | 20 --------------------
 include/linux/blkdev.h |  1 -
 2 files changed, 21 deletions(-)

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index e47a2f751884..8ca0f6caf174 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -145,26 +145,6 @@ do_local:
 }
 EXPORT_SYMBOL(__blk_complete_request);
 
-/**
- * blk_complete_request - end I/O on a request
- * @req:      the request being processed
- *
- * Description:
- *     Ends all I/O on a request. It does not handle partial completions,
- *     unless the driver actually implements this in its completion callback
- *     through requeueing. The actual completion happens out-of-order,
- *     through a softirq handler. The user must have registered a completion
- *     callback through blk_queue_softirq_done().
- **/
-void blk_complete_request(struct request *req)
-{
-	if (unlikely(blk_should_fake_timeout(req->q)))
-		return;
-	if (!blk_mark_rq_complete(req))
-		__blk_complete_request(req);
-}
-EXPORT_SYMBOL(blk_complete_request);
-
 static __init int blk_softirq_init(void)
 {
 	int i;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e867733b761d..6baea6563364 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1203,7 +1203,6 @@ extern bool __blk_end_request(struct request *rq, blk_status_t error,
 extern void __blk_end_request_all(struct request *rq, blk_status_t error);
 extern bool __blk_end_request_cur(struct request *rq, blk_status_t error);
 
-extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_unprep_request(struct request *);

From 3c7741567b8188f55e3704e56bed96460bf8d396 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 12 Oct 2018 10:04:48 -0600
Subject: [PATCH 021/351] blk-wbt: kill check for legacy queue type

Everything is blk-mq at this point, so it doesn't make any sense
to have this option available as it does nothing.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/Kconfig   | 6 ------
 block/blk-wbt.c | 3 +--
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index f7045aa47edb..8044452a4fd3 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY
 
 	Note, this is an experimental interface and could be changed someday.
 
-config BLK_WBT_SQ
-	bool "Single queue writeback throttling"
-	depends on BLK_WBT
-	---help---
-	Enable writeback throttling by default on legacy single queue devices
-
 config BLK_WBT_MQ
 	bool "Multiqueue writeback throttling"
 	default y
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 8ac93fcbaa2e..0fc222d4194b 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -709,8 +709,7 @@ void wbt_enable_default(struct request_queue *q)
 	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
 		return;
 
-	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
-	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
+	if (q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ))
 		wbt_init(q);
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);

From 2cdf2caecda6cb16c24c6bdd2484d4cec99cfbb3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 11 Oct 2018 17:46:00 -0600
Subject: [PATCH 022/351] blk-cgroup: remove legacy queue bypassing

We only support mq devices now.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c630e02836a8..41b2470042d1 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1398,8 +1398,6 @@ int blkcg_activate_policy(struct request_queue *q,
 
 	if (q->mq_ops)
 		blk_mq_freeze_queue(q);
-	else
-		blk_queue_bypass_start(q);
 pd_prealloc:
 	if (!pd_prealloc) {
 		pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
@@ -1439,8 +1437,6 @@ pd_prealloc:
 out_bypass_end:
 	if (q->mq_ops)
 		blk_mq_unfreeze_queue(q);
-	else
-		blk_queue_bypass_end(q);
 	if (pd_prealloc)
 		pol->pd_free_fn(pd_prealloc);
 	return ret;
@@ -1465,8 +1461,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 	if (q->mq_ops)
 		blk_mq_freeze_queue(q);
-	else
-		blk_queue_bypass_start(q);
 
 	spin_lock_irq(q->queue_lock);
 
@@ -1485,8 +1479,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 	if (q->mq_ops)
 		blk_mq_unfreeze_queue(q);
-	else
-		blk_queue_bypass_end(q);
 }
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
 

From 7ca01926463a15f5d2681458643b2453930b873a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 03:39:36 -0600
Subject: [PATCH 023/351] block: remove legacy rq tagging

It's now unused, kill it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/biodoc.txt |  88 --------
 block/Makefile                 |   2 +-
 block/blk-core.c               |   6 -
 block/blk-mq-debugfs.c         |   2 -
 block/blk-mq-tag.c             |   6 +-
 block/blk-sysfs.c              |   3 -
 block/blk-tag.c                | 378 ---------------------------------
 include/linux/blkdev.h         |  35 ---
 8 files changed, 3 insertions(+), 517 deletions(-)
 delete mode 100644 block/blk-tag.c

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 207eca58efaa..ac18b488cb5e 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -65,7 +65,6 @@ Description of Contents:
     3.2.3 I/O completion
     3.2.4 Implications for drivers that do not interpret bios (don't handle
  	  multiple segments)
-    3.2.5 Request command tagging
   3.3 I/O submission
 4. The I/O scheduler
 5. Scalability related changes
@@ -708,93 +707,6 @@ is crossed on completion of a transfer. (The end*request* functions should
 be used if only if the request has come down from block/bio path, not for
 direct access requests which only specify rq->buffer without a valid rq->bio)
 
-3.2.5 Generic request command tagging
-
-3.2.5.1 Tag helpers
-
-Block now offers some simple generic functionality to help support command
-queueing (typically known as tagged command queueing), ie manage more than
-one outstanding command on a queue at any given time.
-
-	blk_queue_init_tags(struct request_queue *q, int depth)
-
-	Initialize internal command tagging structures for a maximum
-	depth of 'depth'.
-
-	blk_queue_free_tags((struct request_queue *q)
-
-	Teardown tag info associated with the queue. This will be done
-	automatically by block if blk_queue_cleanup() is called on a queue
-	that is using tagging.
-
-The above are initialization and exit management, the main helpers during
-normal operations are:
-
-	blk_queue_start_tag(struct request_queue *q, struct request *rq)
-
-	Start tagged operation for this request. A free tag number between
-	0 and 'depth' is assigned to the request (rq->tag holds this number),
-	and 'rq' is added to the internal tag management. If the maximum depth
-	for this queue is already achieved (or if the tag wasn't started for
-	some other reason), 1 is returned. Otherwise 0 is returned.
-
-	blk_queue_end_tag(struct request_queue *q, struct request *rq)
-
-	End tagged operation on this request. 'rq' is removed from the internal
-	book keeping structures.
-
-To minimize struct request and queue overhead, the tag helpers utilize some
-of the same request members that are used for normal request queue management.
-This means that a request cannot both be an active tag and be on the queue
-list at the same time. blk_queue_start_tag() will remove the request, but
-the driver must remember to call blk_queue_end_tag() before signalling
-completion of the request to the block layer. This means ending tag
-operations before calling end_that_request_last()! For an example of a user
-of these helpers, see the IDE tagged command queueing support.
-
-3.2.5.2 Tag info
-
-Some block functions exist to query current tag status or to go from a
-tag number to the associated request. These are, in no particular order:
-
-	blk_queue_tagged(q)
-
-	Returns 1 if the queue 'q' is using tagging, 0 if not.
-
-	blk_queue_tag_request(q, tag)
-
-	Returns a pointer to the request associated with tag 'tag'.
-
-	blk_queue_tag_depth(q)
-	
-	Return current queue depth.
-
-	blk_queue_tag_queue(q)
-
-	Returns 1 if the queue can accept a new queued command, 0 if we are
-	at the maximum depth already.
-
-	blk_queue_rq_tagged(rq)
-
-	Returns 1 if the request 'rq' is tagged.
-
-3.2.5.2 Internal structure
-
-Internally, block manages tags in the blk_queue_tag structure:
-
-	struct blk_queue_tag {
-		struct request **tag_index;	/* array or pointers to rq */
-		unsigned long *tag_map;		/* bitmap of free tags */
-		struct list_head busy_list;	/* fifo list of busy tags */
-		int busy;			/* queue depth */
-		int max_depth;			/* max queue depth */
-	};
-
-Most of the above is simple and straight forward, however busy_list may need
-a bit of explaining. Normally we don't care too much about request ordering,
-but in the event of any barrier requests in the tag queue we need to ensure
-that requests are restarted in the order they were queue.
-
 3.3 I/O Submission
 
 The routine submit_bio() is used to submit a single io. Higher level i/o
diff --git a/block/Makefile b/block/Makefile
index 27eac600474f..213674c8faaa 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,7 +3,7 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
+obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
diff --git a/block/blk-core.c b/block/blk-core.c
index 03ef8f0e7dc5..daaed4dfa719 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1658,9 +1658,6 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	trace_block_rq_requeue(q, rq);
 	rq_qos_requeue(q, rq);
 
-	if (rq->rq_flags & RQF_QUEUED)
-		blk_queue_end_tag(q, rq);
-
 	BUG_ON(blk_queued_rq(rq));
 
 	elv_requeue_request(q, rq);
@@ -3174,9 +3171,6 @@ void blk_finish_request(struct request *req, blk_status_t error)
 	if (req->rq_flags & RQF_STATS)
 		blk_stat_add(req, now);
 
-	if (req->rq_flags & RQF_QUEUED)
-		blk_queue_end_tag(q, req);
-
 	BUG_ON(blk_queued_rq(req));
 
 	if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 10b284a1f18d..9ed43a7c70b5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -112,7 +112,6 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
 
 #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name
 static const char *const blk_queue_flag_name[] = {
-	QUEUE_FLAG_NAME(QUEUED),
 	QUEUE_FLAG_NAME(STOPPED),
 	QUEUE_FLAG_NAME(DYING),
 	QUEUE_FLAG_NAME(BYPASS),
@@ -318,7 +317,6 @@ static const char *const cmd_flag_name[] = {
 static const char *const rqf_name[] = {
 	RQF_NAME(SORTED),
 	RQF_NAME(STARTED),
-	RQF_NAME(QUEUED),
 	RQF_NAME(SOFTBARRIER),
 	RQF_NAME(FLUSH_SEQ),
 	RQF_NAME(MIXED_MERGE),
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index cfda95b85d34..4254e74c1446 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -530,10 +530,8 @@ u32 blk_mq_unique_tag(struct request *rq)
 	struct blk_mq_hw_ctx *hctx;
 	int hwq = 0;
 
-	if (q->mq_ops) {
-		hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-		hwq = hctx->queue_num;
-	}
+	hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+	hwq = hctx->queue_num;
 
 	return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
 		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 844a454a7b3a..1b82ccfde3fe 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -849,9 +849,6 @@ static void __blk_release_queue(struct work_struct *work)
 
 	blk_exit_rl(q, &q->root_rl);
 
-	if (q->queue_tags)
-		__blk_queue_free_tags(q);
-
 	blk_queue_free_zone_bitmaps(q);
 
 	if (!q->mq_ops) {
diff --git a/block/blk-tag.c b/block/blk-tag.c
deleted file mode 100644
index fbc153aef166..000000000000
--- a/block/blk-tag.c
+++ /dev/null
@@ -1,378 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions related to tagged command queuing
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-
-#include "blk.h"
-
-/**
- * blk_queue_find_tag - find a request by its tag and queue
- * @q:	 The request queue for the device
- * @tag: The tag of the request
- *
- * Notes:
- *    Should be used when a device returns a tag and you want to match
- *    it with a request.
- *
- *    no locks need be held.
- **/
-struct request *blk_queue_find_tag(struct request_queue *q, int tag)
-{
-	return blk_map_queue_find_tag(q->queue_tags, tag);
-}
-EXPORT_SYMBOL(blk_queue_find_tag);
-
-/**
- * blk_free_tags - release a given set of tag maintenance info
- * @bqt:	the tag map to free
- *
- * Drop the reference count on @bqt and frees it when the last reference
- * is dropped.
- */
-void blk_free_tags(struct blk_queue_tag *bqt)
-{
-	if (atomic_dec_and_test(&bqt->refcnt)) {
-		BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) <
-							bqt->max_depth);
-
-		kfree(bqt->tag_index);
-		bqt->tag_index = NULL;
-
-		kfree(bqt->tag_map);
-		bqt->tag_map = NULL;
-
-		kfree(bqt);
-	}
-}
-EXPORT_SYMBOL(blk_free_tags);
-
-/**
- * __blk_queue_free_tags - release tag maintenance info
- * @q:  the request queue for the device
- *
- *  Notes:
- *    blk_cleanup_queue() will take care of calling this function, if tagging
- *    has been used. So there's no need to call this directly.
- **/
-void __blk_queue_free_tags(struct request_queue *q)
-{
-	struct blk_queue_tag *bqt = q->queue_tags;
-
-	if (!bqt)
-		return;
-
-	blk_free_tags(bqt);
-
-	q->queue_tags = NULL;
-	queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
-}
-
-/**
- * blk_queue_free_tags - release tag maintenance info
- * @q:  the request queue for the device
- *
- *  Notes:
- *	This is used to disable tagged queuing to a device, yet leave
- *	queue in function.
- **/
-void blk_queue_free_tags(struct request_queue *q)
-{
-	queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q);
-}
-EXPORT_SYMBOL(blk_queue_free_tags);
-
-static int
-init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
-{
-	struct request **tag_index;
-	unsigned long *tag_map;
-	int nr_ulongs;
-
-	if (q && depth > q->nr_requests * 2) {
-		depth = q->nr_requests * 2;
-		printk(KERN_ERR "%s: adjusted depth to %d\n",
-		       __func__, depth);
-	}
-
-	tag_index = kcalloc(depth, sizeof(struct request *), GFP_ATOMIC);
-	if (!tag_index)
-		goto fail;
-
-	nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
-	tag_map = kcalloc(nr_ulongs, sizeof(unsigned long), GFP_ATOMIC);
-	if (!tag_map)
-		goto fail;
-
-	tags->real_max_depth = depth;
-	tags->max_depth = depth;
-	tags->tag_index = tag_index;
-	tags->tag_map = tag_map;
-
-	return 0;
-fail:
-	kfree(tag_index);
-	return -ENOMEM;
-}
-
-static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
-						int depth, int alloc_policy)
-{
-	struct blk_queue_tag *tags;
-
-	tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
-	if (!tags)
-		goto fail;
-
-	if (init_tag_map(q, tags, depth))
-		goto fail;
-
-	atomic_set(&tags->refcnt, 1);
-	tags->alloc_policy = alloc_policy;
-	tags->next_tag = 0;
-	return tags;
-fail:
-	kfree(tags);
-	return NULL;
-}
-
-/**
- * blk_init_tags - initialize the tag info for an external tag map
- * @depth:	the maximum queue depth supported
- * @alloc_policy: tag allocation policy
- **/
-struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy)
-{
-	return __blk_queue_init_tags(NULL, depth, alloc_policy);
-}
-EXPORT_SYMBOL(blk_init_tags);
-
-/**
- * blk_queue_init_tags - initialize the queue tag info
- * @q:  the request queue for the device
- * @depth:  the maximum queue depth supported
- * @tags: the tag to use
- * @alloc_policy: tag allocation policy
- *
- * Queue lock must be held here if the function is called to resize an
- * existing map.
- **/
-int blk_queue_init_tags(struct request_queue *q, int depth,
-			struct blk_queue_tag *tags, int alloc_policy)
-{
-	int rc;
-
-	BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
-
-	if (!tags && !q->queue_tags) {
-		tags = __blk_queue_init_tags(q, depth, alloc_policy);
-
-		if (!tags)
-			return -ENOMEM;
-
-	} else if (q->queue_tags) {
-		rc = blk_queue_resize_tags(q, depth);
-		if (rc)
-			return rc;
-		queue_flag_set(QUEUE_FLAG_QUEUED, q);
-		return 0;
-	} else
-		atomic_inc(&tags->refcnt);
-
-	/*
-	 * assign it, all done
-	 */
-	q->queue_tags = tags;
-	queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q);
-	return 0;
-}
-EXPORT_SYMBOL(blk_queue_init_tags);
-
-/**
- * blk_queue_resize_tags - change the queueing depth
- * @q:  the request queue for the device
- * @new_depth: the new max command queueing depth
- *
- *  Notes:
- *    Must be called with the queue lock held.
- **/
-int blk_queue_resize_tags(struct request_queue *q, int new_depth)
-{
-	struct blk_queue_tag *bqt = q->queue_tags;
-	struct request **tag_index;
-	unsigned long *tag_map;
-	int max_depth, nr_ulongs;
-
-	if (!bqt)
-		return -ENXIO;
-
-	/*
-	 * if we already have large enough real_max_depth.  just
-	 * adjust max_depth.  *NOTE* as requests with tag value
-	 * between new_depth and real_max_depth can be in-flight, tag
-	 * map can not be shrunk blindly here.
-	 */
-	if (new_depth <= bqt->real_max_depth) {
-		bqt->max_depth = new_depth;
-		return 0;
-	}
-
-	/*
-	 * Currently cannot replace a shared tag map with a new
-	 * one, so error out if this is the case
-	 */
-	if (atomic_read(&bqt->refcnt) != 1)
-		return -EBUSY;
-
-	/*
-	 * save the old state info, so we can copy it back
-	 */
-	tag_index = bqt->tag_index;
-	tag_map = bqt->tag_map;
-	max_depth = bqt->real_max_depth;
-
-	if (init_tag_map(q, bqt, new_depth))
-		return -ENOMEM;
-
-	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
-	nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
-	memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
-
-	kfree(tag_index);
-	kfree(tag_map);
-	return 0;
-}
-EXPORT_SYMBOL(blk_queue_resize_tags);
-
-/**
- * blk_queue_end_tag - end tag operations for a request
- * @q:  the request queue for the device
- * @rq: the request that has completed
- *
- *  Description:
- *    Typically called when end_that_request_first() returns %0, meaning
- *    all transfers have been done for a request. It's important to call
- *    this function before end_that_request_last(), as that will put the
- *    request back on the free list thus corrupting the internal tag list.
- **/
-void blk_queue_end_tag(struct request_queue *q, struct request *rq)
-{
-	struct blk_queue_tag *bqt = q->queue_tags;
-	unsigned tag = rq->tag; /* negative tags invalid */
-
-	lockdep_assert_held(q->queue_lock);
-
-	BUG_ON(tag >= bqt->real_max_depth);
-
-	list_del_init(&rq->queuelist);
-	rq->rq_flags &= ~RQF_QUEUED;
-	rq->tag = -1;
-	rq->internal_tag = -1;
-
-	if (unlikely(bqt->tag_index[tag] == NULL))
-		printk(KERN_ERR "%s: tag %d is missing\n",
-		       __func__, tag);
-
-	bqt->tag_index[tag] = NULL;
-
-	if (unlikely(!test_bit(tag, bqt->tag_map))) {
-		printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
-		       __func__, tag);
-		return;
-	}
-	/*
-	 * The tag_map bit acts as a lock for tag_index[bit], so we need
-	 * unlock memory barrier semantics.
-	 */
-	clear_bit_unlock(tag, bqt->tag_map);
-}
-
-/**
- * blk_queue_start_tag - find a free tag and assign it
- * @q:  the request queue for the device
- * @rq:  the block request that needs tagging
- *
- *  Description:
- *    This can either be used as a stand-alone helper, or possibly be
- *    assigned as the queue &prep_rq_fn (in which case &struct request
- *    automagically gets a tag assigned). Note that this function
- *    assumes that any type of request can be queued! if this is not
- *    true for your device, you must check the request type before
- *    calling this function.  The request will also be removed from
- *    the request queue, so it's the drivers responsibility to readd
- *    it if it should need to be restarted for some reason.
- **/
-int blk_queue_start_tag(struct request_queue *q, struct request *rq)
-{
-	struct blk_queue_tag *bqt = q->queue_tags;
-	unsigned max_depth;
-	int tag;
-
-	lockdep_assert_held(q->queue_lock);
-
-	if (unlikely((rq->rq_flags & RQF_QUEUED))) {
-		printk(KERN_ERR
-		       "%s: request %p for device [%s] already tagged %d",
-		       __func__, rq,
-		       rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
-		BUG();
-	}
-
-	/*
-	 * Protect against shared tag maps, as we may not have exclusive
-	 * access to the tag map.
-	 *
-	 * We reserve a few tags just for sync IO, since we don't want
-	 * to starve sync IO on behalf of flooding async IO.
-	 */
-	max_depth = bqt->max_depth;
-	if (!rq_is_sync(rq) && max_depth > 1) {
-		switch (max_depth) {
-		case 2:
-			max_depth = 1;
-			break;
-		case 3:
-			max_depth = 2;
-			break;
-		default:
-			max_depth -= 2;
-		}
-		if (q->in_flight[BLK_RW_ASYNC] > max_depth)
-			return 1;
-	}
-
-	do {
-		if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) {
-			tag = find_first_zero_bit(bqt->tag_map, max_depth);
-			if (tag >= max_depth)
-				return 1;
-		} else {
-			int start = bqt->next_tag;
-			int size = min_t(int, bqt->max_depth, max_depth + start);
-			tag = find_next_zero_bit(bqt->tag_map, size, start);
-			if (tag >= size && start + size > bqt->max_depth) {
-				size = start + size - bqt->max_depth;
-				tag = find_first_zero_bit(bqt->tag_map, size);
-			}
-			if (tag >= size)
-				return 1;
-		}
-
-	} while (test_and_set_bit_lock(tag, bqt->tag_map));
-	/*
-	 * We need lock ordering semantics given by test_and_set_bit_lock.
-	 * See blk_queue_end_tag for details.
-	 */
-
-	bqt->next_tag = (tag + 1) % bqt->max_depth;
-	rq->rq_flags |= RQF_QUEUED;
-	rq->tag = tag;
-	bqt->tag_index[tag] = rq;
-	blk_start_request(rq);
-	return 0;
-}
-EXPORT_SYMBOL(blk_queue_start_tag);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6baea6563364..8afe3331777e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -85,8 +85,6 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_SORTED		((__force req_flags_t)(1 << 0))
 /* drive already may have started this one */
 #define RQF_STARTED		((__force req_flags_t)(1 << 1))
-/* uses tagged queueing */
-#define RQF_QUEUED		((__force req_flags_t)(1 << 2))
 /* may not be passed by ioscheduler */
 #define RQF_SOFTBARRIER		((__force req_flags_t)(1 << 3))
 /* request for flush sequence */
@@ -336,15 +334,6 @@ enum blk_queue_state {
 	Queue_up,
 };
 
-struct blk_queue_tag {
-	struct request **tag_index;	/* map of busy tags */
-	unsigned long *tag_map;		/* bit map of free/busy tags */
-	int max_depth;			/* what we will send to device */
-	int real_max_depth;		/* what the array can hold */
-	atomic_t refcnt;		/* map can be shared */
-	int alloc_policy;		/* tag allocation policy */
-	int next_tag;			/* next tag */
-};
 #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */
 #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */
 
@@ -568,8 +557,6 @@ struct request_queue {
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
 
-	struct blk_queue_tag	*queue_tags;
-
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
 
@@ -680,7 +667,6 @@ struct request_queue {
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
 };
 
-#define QUEUE_FLAG_QUEUED	0	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	1	/* queue is stopped */
 #define QUEUE_FLAG_DYING	2	/* queue being torn down */
 #define QUEUE_FLAG_BYPASS	3	/* act as dumb FIFO queue */
@@ -724,7 +710,6 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 
-#define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
@@ -1359,26 +1344,6 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 		 !list_empty(&plug->cb_list));
 }
 
-/*
- * tag stuff
- */
-extern int blk_queue_start_tag(struct request_queue *, struct request *);
-extern struct request *blk_queue_find_tag(struct request_queue *, int);
-extern void blk_queue_end_tag(struct request_queue *, struct request *);
-extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int);
-extern void blk_queue_free_tags(struct request_queue *);
-extern int blk_queue_resize_tags(struct request_queue *, int);
-extern struct blk_queue_tag *blk_init_tags(int, int);
-extern void blk_free_tags(struct blk_queue_tag *);
-
-static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
-						int tag)
-{
-	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
-		return NULL;
-	return bqt->tag_index[tag];
-}
-
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);

From 7e992f847a08ecda3f658ceebb39a4d8e739ba36 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 13 Oct 2018 15:44:12 -0600
Subject: [PATCH 024/351] block: remove non mq parts from the flush code

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c | 152 +++++++++-------------------------------------
 block/blk.h       |   4 +-
 2 files changed, 30 insertions(+), 126 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 8b44b86779da..9baa9a119447 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -134,16 +134,8 @@ static void blk_flush_restore_request(struct request *rq)
 
 static bool blk_flush_queue_rq(struct request *rq, bool add_front)
 {
-	if (rq->q->mq_ops) {
-		blk_mq_add_to_requeue_list(rq, add_front, true);
-		return false;
-	} else {
-		if (add_front)
-			list_add(&rq->queuelist, &rq->q->queue_head);
-		else
-			list_add_tail(&rq->queuelist, &rq->q->queue_head);
-		return true;
-	}
+	blk_mq_add_to_requeue_list(rq, add_front, true);
+	return false;
 }
 
 /**
@@ -204,10 +196,7 @@ static bool blk_flush_complete_seq(struct request *rq,
 		BUG_ON(!list_empty(&rq->queuelist));
 		list_del_init(&rq->flush.list);
 		blk_flush_restore_request(rq);
-		if (q->mq_ops)
-			blk_mq_end_request(rq, error);
-		else
-			__blk_end_request_all(rq, error);
+		blk_mq_end_request(rq, error);
 		break;
 
 	default:
@@ -226,20 +215,17 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	struct request *rq, *n;
 	unsigned long flags = 0;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
+	struct blk_mq_hw_ctx *hctx;
 
-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
-
-		/* release the tag's ownership to the req cloned from */
-		spin_lock_irqsave(&fq->mq_flush_lock, flags);
-		hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
-		if (!q->elevator) {
-			blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
-			flush_rq->tag = -1;
-		} else {
-			blk_mq_put_driver_tag_hctx(hctx, flush_rq);
-			flush_rq->internal_tag = -1;
-		}
+	/* release the tag's ownership to the req cloned from */
+	spin_lock_irqsave(&fq->mq_flush_lock, flags);
+	hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
+	if (!q->elevator) {
+		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
+		flush_rq->tag = -1;
+	} else {
+		blk_mq_put_driver_tag_hctx(hctx, flush_rq);
+		flush_rq->internal_tag = -1;
 	}
 
 	running = &fq->flush_queue[fq->flush_running_idx];
@@ -248,9 +234,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	/* account completion of the flush request */
 	fq->flush_running_idx ^= 1;
 
-	if (!q->mq_ops)
-		elv_completed_request(q, flush_rq);
-
 	/* and push the waiting requests to the next stage */
 	list_for_each_entry_safe(rq, n, running, flush.list) {
 		unsigned int seq = blk_flush_cur_seq(rq);
@@ -259,24 +242,8 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		queued |= blk_flush_complete_seq(rq, fq, seq, error);
 	}
 
-	/*
-	 * Kick the queue to avoid stall for two cases:
-	 * 1. Moving a request silently to empty queue_head may stall the
-	 * queue.
-	 * 2. When flush request is running in non-queueable queue, the
-	 * queue is hold. Restart the queue after flush request is finished
-	 * to avoid stall.
-	 * This function is called from request completion path and calling
-	 * directly into request_fn may confuse the driver.  Always use
-	 * kblockd.
-	 */
-	if (queued || fq->flush_queue_delayed) {
-		WARN_ON(q->mq_ops);
-		blk_run_queue_async(q);
-	}
 	fq->flush_queue_delayed = 0;
-	if (q->mq_ops)
-		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 }
 
 /**
@@ -301,6 +268,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	struct request *first_rq =
 		list_first_entry(pending, struct request, flush.list);
 	struct request *flush_rq = fq->flush_rq;
+	struct blk_mq_hw_ctx *hctx;
 
 	/* C1 described at the top of this file */
 	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
@@ -334,19 +302,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	 * In case of IO scheduler, flush rq need to borrow scheduler tag
 	 * just for cheating put/get driver tag.
 	 */
-	if (q->mq_ops) {
-		struct blk_mq_hw_ctx *hctx;
+	flush_rq->mq_ctx = first_rq->mq_ctx;
 
-		flush_rq->mq_ctx = first_rq->mq_ctx;
-
-		if (!q->elevator) {
-			fq->orig_rq = first_rq;
-			flush_rq->tag = first_rq->tag;
-			hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
-			blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
-		} else {
-			flush_rq->internal_tag = first_rq->internal_tag;
-		}
+	if (!q->elevator) {
+		fq->orig_rq = first_rq;
+		flush_rq->tag = first_rq->tag;
+		hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
+		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+	} else {
+		flush_rq->internal_tag = first_rq->internal_tag;
 	}
 
 	flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
@@ -358,49 +322,6 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	return blk_flush_queue_rq(flush_rq, false);
 }
 
-static void flush_data_end_io(struct request *rq, blk_status_t error)
-{
-	struct request_queue *q = rq->q;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
-
-	lockdep_assert_held(q->queue_lock);
-
-	/*
-	 * Updating q->in_flight[] here for making this tag usable
-	 * early. Because in blk_queue_start_tag(),
-	 * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and
-	 * reserve tags for sync I/O.
-	 *
-	 * More importantly this way can avoid the following I/O
-	 * deadlock:
-	 *
-	 * - suppose there are 40 fua requests comming to flush queue
-	 *   and queue depth is 31
-	 * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc
-	 *   tag for async I/O any more
-	 * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT
-	 *   and flush_data_end_io() is called
-	 * - the other rqs still can't go ahead if not updating
-	 *   q->in_flight[BLK_RW_ASYNC] here, meantime these rqs
-	 *   are held in flush data queue and make no progress of
-	 *   handling post flush rq
-	 * - only after the post flush rq is handled, all these rqs
-	 *   can be completed
-	 */
-
-	elv_completed_request(q, rq);
-
-	/* for avoiding double accounting */
-	rq->rq_flags &= ~RQF_STARTED;
-
-	/*
-	 * After populating an empty queue, kick it to avoid stall.  Read
-	 * the comment in flush_end_io().
-	 */
-	if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error))
-		blk_run_queue_async(q);
-}
-
 static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
@@ -443,9 +364,6 @@ void blk_insert_flush(struct request *rq)
 	unsigned int policy = blk_flush_policy(fflags, rq);
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
 
-	if (!q->mq_ops)
-		lockdep_assert_held(q->queue_lock);
-
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_PREFLUSH and FUA for the driver.
@@ -468,10 +386,7 @@ void blk_insert_flush(struct request *rq)
 	 * complete the request.
 	 */
 	if (!policy) {
-		if (q->mq_ops)
-			blk_mq_end_request(rq, 0);
-		else
-			__blk_end_request(rq, 0, 0);
+		blk_mq_end_request(rq, 0);
 		return;
 	}
 
@@ -484,10 +399,7 @@ void blk_insert_flush(struct request *rq)
 	 */
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		if (q->mq_ops)
-			blk_mq_request_bypass_insert(rq, false);
-		else
-			list_add_tail(&rq->queuelist, &q->queue_head);
+		blk_mq_request_bypass_insert(rq, false);
 		return;
 	}
 
@@ -499,17 +411,12 @@ void blk_insert_flush(struct request *rq)
 	INIT_LIST_HEAD(&rq->flush.list);
 	rq->rq_flags |= RQF_FLUSH_SEQ;
 	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
-	if (q->mq_ops) {
-		rq->end_io = mq_flush_data_end_io;
 
-		spin_lock_irq(&fq->mq_flush_lock);
-		blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
-		spin_unlock_irq(&fq->mq_flush_lock);
-		return;
-	}
-	rq->end_io = flush_data_end_io;
+	rq->end_io = mq_flush_data_end_io;
 
+	spin_lock_irq(&fq->mq_flush_lock);
 	blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
+	spin_unlock_irq(&fq->mq_flush_lock);
 }
 
 /**
@@ -575,8 +482,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 	if (!fq)
 		goto fail;
 
-	if (q->mq_ops)
-		spin_lock_init(&fq->mq_flush_lock);
+	spin_lock_init(&fq->mq_flush_lock);
 
 	rq_sz = round_up(rq_sz + cmd_size, cache_line_size());
 	fq->flush_rq = kzalloc_node(rq_sz, flags, node);
diff --git a/block/blk.h b/block/blk.h
index a1841b8ff129..57a302bf5a70 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -114,9 +114,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 static inline struct blk_flush_queue *blk_get_flush_queue(
 		struct request_queue *q, struct blk_mq_ctx *ctx)
 {
-	if (q->mq_ops)
-		return blk_mq_map_queue(q, ctx->cpu)->fq;
-	return q->fq;
+	return blk_mq_map_queue(q, ctx->cpu)->fq;
 }
 
 static inline void __blk_get_queue(struct request_queue *q)

From 404b8f5a03d840f74669fd55e26f8e3564cc2dd8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Oct 2018 12:43:24 -0600
Subject: [PATCH 025/351] block: cleanup kick/queued handling

Now that blk_flush_queue_rq() always returns false, we can
remove that return value. That bubbles through the stack,
allowing us to remove a bunch of state tracking around it.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 9baa9a119447..248fe78c2b9b 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -93,7 +93,7 @@ enum {
 	FLUSH_PENDING_TIMEOUT	= 5 * HZ,
 };
 
-static bool blk_kick_flush(struct request_queue *q,
+static void blk_kick_flush(struct request_queue *q,
 			   struct blk_flush_queue *fq, unsigned int flags);
 
 static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
@@ -132,10 +132,9 @@ static void blk_flush_restore_request(struct request *rq)
 	rq->end_io = rq->flush.saved_end_io;
 }
 
-static bool blk_flush_queue_rq(struct request *rq, bool add_front)
+static void blk_flush_queue_rq(struct request *rq, bool add_front)
 {
 	blk_mq_add_to_requeue_list(rq, add_front, true);
-	return false;
 }
 
 /**
@@ -154,13 +153,12 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front)
  * RETURNS:
  * %true if requests were added to the dispatch queue, %false otherwise.
  */
-static bool blk_flush_complete_seq(struct request *rq,
+static void blk_flush_complete_seq(struct request *rq,
 				   struct blk_flush_queue *fq,
 				   unsigned int seq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
-	bool queued = false, kicked;
 	unsigned int cmd_flags;
 
 	BUG_ON(rq->flush.seq & seq);
@@ -183,7 +181,7 @@ static bool blk_flush_complete_seq(struct request *rq,
 
 	case REQ_FSEQ_DATA:
 		list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
-		queued = blk_flush_queue_rq(rq, true);
+		blk_flush_queue_rq(rq, true);
 		break;
 
 	case REQ_FSEQ_DONE:
@@ -203,15 +201,13 @@ static bool blk_flush_complete_seq(struct request *rq,
 		BUG();
 	}
 
-	kicked = blk_kick_flush(q, fq, cmd_flags);
-	return kicked | queued;
+	blk_kick_flush(q, fq, cmd_flags);
 }
 
 static void flush_end_io(struct request *flush_rq, blk_status_t error)
 {
 	struct request_queue *q = flush_rq->q;
 	struct list_head *running;
-	bool queued = false;
 	struct request *rq, *n;
 	unsigned long flags = 0;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
@@ -239,7 +235,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 		unsigned int seq = blk_flush_cur_seq(rq);
 
 		BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
-		queued |= blk_flush_complete_seq(rq, fq, seq, error);
+		blk_flush_complete_seq(rq, fq, seq, error);
 	}
 
 	fq->flush_queue_delayed = 0;
@@ -258,10 +254,8 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
  * CONTEXT:
  * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
  *
- * RETURNS:
- * %true if flush was issued, %false otherwise.
  */
-static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
+static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 			   unsigned int flags)
 {
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
@@ -272,7 +266,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 
 	/* C1 described at the top of this file */
 	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
-		return false;
+		return;
 
 	/* C2 and C3
 	 *
@@ -284,7 +278,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	    !(q->mq_ops && q->elevator) &&
 	    time_before(jiffies,
 			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
-		return false;
+		return;
 
 	/*
 	 * Issue flush and toggle pending_idx.  This makes pending_idx
@@ -319,7 +313,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	flush_rq->rq_disk = first_rq->rq_disk;
 	flush_rq->end_io = flush_end_io;
 
-	return blk_flush_queue_rq(flush_rq, false);
+	blk_flush_queue_rq(flush_rq, false);
 }
 
 static void mq_flush_data_end_io(struct request *rq, blk_status_t error)

From f382fb0bcef4c37dc049e9f6963e3baf204d815c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 12 Oct 2018 10:14:46 -0600
Subject: [PATCH 026/351] block: remove legacy IO schedulers

Retain the deadline documentation, as that carries over to mq-deadline
as well.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/block/cfq-iosched.txt |  291 --
 block/Kconfig.iosched               |   61 -
 block/Makefile                      |    3 -
 block/cfq-iosched.c                 | 4916 ---------------------------
 block/deadline-iosched.c            |  560 ---
 block/elevator.c                    |   70 -
 block/noop-iosched.c                |  124 -
 7 files changed, 6025 deletions(-)
 delete mode 100644 Documentation/block/cfq-iosched.txt
 delete mode 100644 block/cfq-iosched.c
 delete mode 100644 block/deadline-iosched.c
 delete mode 100644 block/noop-iosched.c

diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
deleted file mode 100644
index 895bd3813115..000000000000
--- a/Documentation/block/cfq-iosched.txt
+++ /dev/null
@@ -1,291 +0,0 @@
-CFQ (Complete Fairness Queueing)
-===============================
-
-The main aim of CFQ scheduler is to provide a fair allocation of the disk
-I/O bandwidth for all the processes which requests an I/O operation.
-
-CFQ maintains the per process queue for the processes which request I/O
-operation(synchronous requests). In case of asynchronous requests, all the
-requests from all the processes are batched together according to their
-process's I/O priority.
-
-CFQ ioscheduler tunables
-========================
-
-slice_idle
-----------
-This specifies how long CFQ should idle for next request on certain cfq queues
-(for sequential workloads) and service trees (for random workloads) before
-queue is expired and CFQ selects next queue to dispatch from.
-
-By default slice_idle is a non-zero value. That means by default we idle on
-queues/service trees. This can be very helpful on highly seeky media like
-single spindle SATA/SAS disks where we can cut down on overall number of
-seeks and see improved throughput.
-
-Setting slice_idle to 0 will remove all the idling on queues/service tree
-level and one should see an overall improved throughput on faster storage
-devices like multiple SATA/SAS disks in hardware RAID configuration. The down
-side is that isolation provided from WRITES also goes down and notion of
-IO priority becomes weaker.
-
-So depending on storage and workload, it might be useful to set slice_idle=0.
-In general I think for SATA/SAS disks and software RAID of SATA/SAS disks
-keeping slice_idle enabled should be useful. For any configurations where
-there are multiple spindles behind single LUN (Host based hardware RAID
-controller or for storage arrays), setting slice_idle=0 might end up in better
-throughput and acceptable latencies.
-
-back_seek_max
--------------
-This specifies, given in Kbytes, the maximum "distance" for backward seeking.
-The distance is the amount of space from the current head location to the
-sectors that are backward in terms of distance.
-
-This parameter allows the scheduler to anticipate requests in the "backward"
-direction and consider them as being the "next" if they are within this
-distance from the current head location.
-
-back_seek_penalty
------------------
-This parameter is used to compute the cost of backward seeking. If the
-backward distance of request is just 1/back_seek_penalty from a "front"
-request, then the seeking cost of two requests is considered equivalent.
-
-So scheduler will not bias toward one or the other request (otherwise scheduler
-will bias toward front request). Default value of back_seek_penalty is 2.
-
-fifo_expire_async
------------------
-This parameter is used to set the timeout of asynchronous requests. Default
-value of this is 248ms.
-
-fifo_expire_sync
-----------------
-This parameter is used to set the timeout of synchronous requests. Default
-value of this is 124ms. In case to favor synchronous requests over asynchronous
-one, this value should be decreased relative to fifo_expire_async.
-
-group_idle
------------
-This parameter forces idling at the CFQ group level instead of CFQ
-queue level. This was introduced after a bottleneck was observed
-in higher end storage due to idle on sequential queue and allow dispatch
-from a single queue. The idea with this parameter is that it can be run with
-slice_idle=0 and group_idle=8, so that idling does not happen on individual
-queues in the group but happens overall on the group and thus still keeps the
-IO controller working.
-Not idling on individual queues in the group will dispatch requests from
-multiple queues in the group at the same time and achieve higher throughput
-on higher end storage.
-
-Default value for this parameter is 8ms.
-
-low_latency
------------
-This parameter is used to enable/disable the low latency mode of the CFQ
-scheduler. If enabled, CFQ tries to recompute the slice time for each process
-based on the target_latency set for the system. This favors fairness over
-throughput. Disabling low latency (setting it to 0) ignores target latency,
-allowing each process in the system to get a full time slice.
-
-By default low latency mode is enabled.
-
-target_latency
---------------
-This parameter is used to calculate the time slice for a process if cfq's
-latency mode is enabled. It will ensure that sync requests have an estimated
-latency. But if sequential workload is higher(e.g. sequential read),
-then to meet the latency constraints, throughput may decrease because of less
-time for each process to issue I/O request before the cfq queue is switched.
-
-Though this can be overcome by disabling the latency_mode, it may increase
-the read latency for some applications. This parameter allows for changing
-target_latency through the sysfs interface which can provide the balanced
-throughput and read latency.
-
-Default value for target_latency is 300ms.
-
-slice_async
------------
-This parameter is same as of slice_sync but for asynchronous queue. The
-default value is 40ms.
-
-slice_async_rq
---------------
-This parameter is used to limit the dispatching of asynchronous request to
-device request queue in queue's slice time. The maximum number of request that
-are allowed to be dispatched also depends upon the io priority. Default value
-for this is 2.
-
-slice_sync
-----------
-When a queue is selected for execution, the queues IO requests are only
-executed for a certain amount of time(time_slice) before switching to another
-queue. This parameter is used to calculate the time slice of synchronous
-queue.
-
-time_slice is computed using the below equation:-
-time_slice = slice_sync + (slice_sync/5 * (4 - prio)). To increase the
-time_slice of synchronous queue, increase the value of slice_sync. Default
-value is 100ms.
-
-quantum
--------
-This specifies the number of request dispatched to the device queue. In a
-queue's time slice, a request will not be dispatched if the number of request
-in the device exceeds this parameter. This parameter is used for synchronous
-request.
-
-In case of storage with several disk, this setting can limit the parallel
-processing of request. Therefore, increasing the value can improve the
-performance although this can cause the latency of some I/O to increase due
-to more number of requests.
-
-CFQ Group scheduling
-====================
-
-CFQ supports blkio cgroup and has "blkio." prefixed files in each
-blkio cgroup directory. It is weight-based and there are four knobs
-for configuration - weight[_device] and leaf_weight[_device].
-Internal cgroup nodes (the ones with children) can also have tasks in
-them, so the former two configure how much proportion the cgroup as a
-whole is entitled to at its parent's level while the latter two
-configure how much proportion the tasks in the cgroup have compared to
-its direct children.
-
-Another way to think about it is assuming that each internal node has
-an implicit leaf child node which hosts all the tasks whose weight is
-configured by leaf_weight[_device]. Let's assume a blkio hierarchy
-composed of five cgroups - root, A, B, AA and AB - with the following
-weights where the names represent the hierarchy.
-
-        weight leaf_weight
- root :  125    125
- A    :  500    750
- B    :  250    500
- AA   :  500    500
- AB   : 1000    500
-
-root never has a parent making its weight is meaningless. For backward
-compatibility, weight is always kept in sync with leaf_weight. B, AA
-and AB have no child and thus its tasks have no children cgroup to
-compete with. They always get 100% of what the cgroup won at the
-parent level. Considering only the weights which matter, the hierarchy
-looks like the following.
-
-          root
-       /    |   \
-      A     B    leaf
-     500   250   125
-   /  |  \
-  AA  AB  leaf
- 500 1000 750
-
-If all cgroups have active IOs and competing with each other, disk
-time will be distributed like the following.
-
-Distribution below root. The total active weight at this level is
-A:500 + B:250 + C:125 = 875.
-
- root-leaf :   125 /  875      =~ 14%
- A         :   500 /  875      =~ 57%
- B(-leaf)  :   250 /  875      =~ 28%
-
-A has children and further distributes its 57% among the children and
-the implicit leaf node. The total active weight at this level is
-AA:500 + AB:1000 + A-leaf:750 = 2250.
-
- A-leaf    : ( 750 / 2250) * A =~ 19%
- AA(-leaf) : ( 500 / 2250) * A =~ 12%
- AB(-leaf) : (1000 / 2250) * A =~ 25%
-
-CFQ IOPS Mode for group scheduling
-===================================
-Basic CFQ design is to provide priority based time slices. Higher priority
-process gets bigger time slice and lower priority process gets smaller time
-slice. Measuring time becomes harder if storage is fast and supports NCQ and
-it would be better to dispatch multiple requests from multiple cfq queues in
-request queue at a time. In such scenario, it is not possible to measure time
-consumed by single queue accurately.
-
-What is possible though is to measure number of requests dispatched from a
-single queue and also allow dispatch from multiple cfq queue at the same time.
-This effectively becomes the fairness in terms of IOPS (IO operations per
-second).
-
-If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
-to IOPS mode and starts providing fairness in terms of number of requests
-dispatched. Note that this mode switching takes effect only for group
-scheduling. For non-cgroup users nothing should change.
-
-CFQ IO scheduler Idling Theory
-===============================
-Idling on a queue is primarily about waiting for the next request to come
-on same queue after completion of a request. In this process CFQ will not
-dispatch requests from other cfq queues even if requests are pending there.
-
-The rationale behind idling is that it can cut down on number of seeks
-on rotational media. For example, if a process is doing dependent
-sequential reads (next read will come on only after completion of previous
-one), then not dispatching request from other queue should help as we
-did not move the disk head and kept on dispatching sequential IO from
-one queue.
-
-CFQ has following service trees and various queues are put on these trees.
-
-	sync-idle	sync-noidle	async
-
-All cfq queues doing synchronous sequential IO go on to sync-idle tree.
-On this tree we idle on each queue individually.
-
-All synchronous non-sequential queues go on sync-noidle tree. Also any
-synchronous write request which is not marked with REQ_IDLE goes on this
-service tree. On this tree we do not idle on individual queues instead idle
-on the whole group of queues or the tree. So if there are 4 queues waiting
-for IO to dispatch we will idle only once last queue has dispatched the IO
-and there is no more IO on this service tree.
-
-All async writes go on async service tree. There is no idling on async
-queues.
-
-CFQ has some optimizations for SSDs and if it detects a non-rotational
-media which can support higher queue depth (multiple requests at in
-flight at a time), then it cuts down on idling of individual queues and
-all the queues move to sync-noidle tree and only tree idle remains. This
-tree idling provides isolation with buffered write queues on async tree.
-
-FAQ
-===
-Q1. Why to idle at all on queues not marked with REQ_IDLE.
-
-A1. We only do tree idle (all queues on sync-noidle tree) on queues not marked
-    with REQ_IDLE. This helps in providing isolation with all the sync-idle
-    queues. Otherwise in presence of many sequential readers, other
-    synchronous IO might not get fair share of disk.
-
-    For example, if there are 10 sequential readers doing IO and they get
-    100ms each. If a !REQ_IDLE request comes in, it will be scheduled
-    roughly after 1 second. If after completion of !REQ_IDLE request we
-    do not idle, and after a couple of milli seconds a another !REQ_IDLE
-    request comes in, again it will be scheduled after 1second. Repeat it
-    and notice how a workload can lose its disk share and suffer due to
-    multiple sequential readers.
-
-    fsync can generate dependent IO where bunch of data is written in the
-    context of fsync, and later some journaling data is written. Journaling
-    data comes in only after fsync has finished its IO (atleast for ext4
-    that seemed to be the case). Now if one decides not to idle on fsync
-    thread due to !REQ_IDLE, then next journaling write will not get
-    scheduled for another second. A process doing small fsync, will suffer
-    badly in presence of multiple sequential readers.
-
-    Hence doing tree idling on threads using !REQ_IDLE flag on requests
-    provides isolation from multiple sequential readers and at the same
-    time we do not idle on individual threads.
-
-Q2. When to specify REQ_IDLE
-A2. I would think whenever one is doing synchronous write and expecting
-    more writes to be dispatched from same context soon, should be able
-    to specify REQ_IDLE on writes and that probably should work well for
-    most of the cases.
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index f95a48b0d7b2..4626b88b2d5a 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -3,67 +3,6 @@ if BLOCK
 
 menu "IO Schedulers"
 
-config IOSCHED_NOOP
-	bool
-	default y
-	---help---
-	  The no-op I/O scheduler is a minimal scheduler that does basic merging
-	  and sorting. Its main uses include non-disk based block devices like
-	  memory devices, and specialised software or hardware environments
-	  that do their own scheduling and require only minimal assistance from
-	  the kernel.
-
-config IOSCHED_DEADLINE
-	tristate "Deadline I/O scheduler"
-	default y
-	---help---
-	  The deadline I/O scheduler is simple and compact. It will provide
-	  CSCAN service with FIFO expiration of requests, switching to
-	  a new point in the service tree and doing a batch of IO from there
-	  in case of expiry.
-
-config IOSCHED_CFQ
-	tristate "CFQ I/O scheduler"
-	default y
-	---help---
-	  The CFQ I/O scheduler tries to distribute bandwidth equally
-	  among all processes in the system. It should provide a fair
-	  and low latency working environment, suitable for both desktop
-	  and server systems.
-
-	  This is the default I/O scheduler.
-
-config CFQ_GROUP_IOSCHED
-	bool "CFQ Group Scheduling support"
-	depends on IOSCHED_CFQ && BLK_CGROUP
-	---help---
-	  Enable group IO scheduling in CFQ.
-
-choice
-
-	prompt "Default I/O scheduler"
-	default DEFAULT_CFQ
-	help
-	  Select the I/O scheduler which will be used by default for all
-	  block devices.
-
-	config DEFAULT_DEADLINE
-		bool "Deadline" if IOSCHED_DEADLINE=y
-
-	config DEFAULT_CFQ
-		bool "CFQ" if IOSCHED_CFQ=y
-
-	config DEFAULT_NOOP
-		bool "No-op"
-
-endchoice
-
-config DEFAULT_IOSCHED
-	string
-	default "deadline" if DEFAULT_DEADLINE
-	default "cfq" if DEFAULT_CFQ
-	default "noop" if DEFAULT_NOOP
-
 config MQ_IOSCHED_DEADLINE
 	tristate "MQ deadline I/O scheduler"
 	default y
diff --git a/block/Makefile b/block/Makefile
index 213674c8faaa..eee1b4ceecf9 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_BLK_CGROUP_IOLATENCY)	+= blk-iolatency.o
-obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
-obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
-obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 obj-$(CONFIG_MQ_IOSCHED_KYBER)	+= kyber-iosched.o
 bfq-y				:= bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
deleted file mode 100644
index ed41aa978c4a..000000000000
--- a/block/cfq-iosched.c
+++ /dev/null
@@ -1,4916 +0,0 @@
-/*
- *  CFQ, or complete fairness queueing, disk scheduler.
- *
- *  Based on ideas from a previously unfinished io
- *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
- *
- *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sched/clock.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/ktime.h>
-#include <linux/rbtree.h>
-#include <linux/ioprio.h>
-#include <linux/blktrace_api.h>
-#include <linux/blk-cgroup.h>
-#include "blk.h"
-#include "blk-wbt.h"
-
-/*
- * tunables
- */
-/* max queue in one round of service */
-static const int cfq_quantum = 8;
-static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
-/* maximum backwards seek, in KiB */
-static const int cfq_back_max = 16 * 1024;
-/* penalty of a backwards seek */
-static const int cfq_back_penalty = 2;
-static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
-static u64 cfq_slice_async = NSEC_PER_SEC / 25;
-static const int cfq_slice_async_rq = 2;
-static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
-static u64 cfq_group_idle = NSEC_PER_SEC / 125;
-static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
-static const int cfq_hist_divisor = 4;
-
-/*
- * offset from end of queue service tree for idle class
- */
-#define CFQ_IDLE_DELAY		(NSEC_PER_SEC / 5)
-/* offset from end of group service tree under time slice mode */
-#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
-/* offset from end of group service under IOPS mode */
-#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
-
-/*
- * below this threshold, we consider thinktime immediate
- */
-#define CFQ_MIN_TT		(2 * NSEC_PER_SEC / HZ)
-
-#define CFQ_SLICE_SCALE		(5)
-#define CFQ_HW_QUEUE_MIN	(5)
-#define CFQ_SERVICE_SHIFT       12
-
-#define CFQQ_SEEK_THR		(sector_t)(8 * 100)
-#define CFQQ_CLOSE_THR		(sector_t)(8 * 1024)
-#define CFQQ_SECT_THR_NONROT	(sector_t)(2 * 32)
-#define CFQQ_SEEKY(cfqq)	(hweight32(cfqq->seek_history) > 32/8)
-
-#define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
-#define RQ_CFQQ(rq)		(struct cfq_queue *) ((rq)->elv.priv[0])
-#define RQ_CFQG(rq)		(struct cfq_group *) ((rq)->elv.priv[1])
-
-static struct kmem_cache *cfq_pool;
-
-#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
-#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
-
-#define sample_valid(samples)	((samples) > 80)
-#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
-
-/* blkio-related constants */
-#define CFQ_WEIGHT_LEGACY_MIN	10
-#define CFQ_WEIGHT_LEGACY_DFL	500
-#define CFQ_WEIGHT_LEGACY_MAX	1000
-
-struct cfq_ttime {
-	u64 last_end_request;
-
-	u64 ttime_total;
-	u64 ttime_mean;
-	unsigned long ttime_samples;
-};
-
-/*
- * Most of our rbtree usage is for sorting with min extraction, so
- * if we cache the leftmost node we don't have to walk down the tree
- * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
- * move this into the elevator for the rq sorting as well.
- */
-struct cfq_rb_root {
-	struct rb_root_cached rb;
-	struct rb_node *rb_rightmost;
-	unsigned count;
-	u64 min_vdisktime;
-	struct cfq_ttime ttime;
-};
-#define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
-			.rb_rightmost = NULL,			     \
-			.ttime = {.last_end_request = ktime_get_ns(),},}
-
-/*
- * Per process-grouping structure
- */
-struct cfq_queue {
-	/* reference count */
-	int ref;
-	/* various state flags, see below */
-	unsigned int flags;
-	/* parent cfq_data */
-	struct cfq_data *cfqd;
-	/* service_tree member */
-	struct rb_node rb_node;
-	/* service_tree key */
-	u64 rb_key;
-	/* prio tree member */
-	struct rb_node p_node;
-	/* prio tree root we belong to, if any */
-	struct rb_root *p_root;
-	/* sorted list of pending requests */
-	struct rb_root sort_list;
-	/* if fifo isn't expired, next request to serve */
-	struct request *next_rq;
-	/* requests queued in sort_list */
-	int queued[2];
-	/* currently allocated requests */
-	int allocated[2];
-	/* fifo list of requests in sort_list */
-	struct list_head fifo;
-
-	/* time when queue got scheduled in to dispatch first request. */
-	u64 dispatch_start;
-	u64 allocated_slice;
-	u64 slice_dispatch;
-	/* time when first request from queue completed and slice started. */
-	u64 slice_start;
-	u64 slice_end;
-	s64 slice_resid;
-
-	/* pending priority requests */
-	int prio_pending;
-	/* number of requests that are on the dispatch list or inside driver */
-	int dispatched;
-
-	/* io prio of this group */
-	unsigned short ioprio, org_ioprio;
-	unsigned short ioprio_class, org_ioprio_class;
-
-	pid_t pid;
-
-	u32 seek_history;
-	sector_t last_request_pos;
-
-	struct cfq_rb_root *service_tree;
-	struct cfq_queue *new_cfqq;
-	struct cfq_group *cfqg;
-	/* Number of sectors dispatched from queue in single dispatch round */
-	unsigned long nr_sectors;
-};
-
-/*
- * First index in the service_trees.
- * IDLE is handled separately, so it has negative index
- */
-enum wl_class_t {
-	BE_WORKLOAD = 0,
-	RT_WORKLOAD = 1,
-	IDLE_WORKLOAD = 2,
-	CFQ_PRIO_NR,
-};
-
-/*
- * Second index in the service_trees.
- */
-enum wl_type_t {
-	ASYNC_WORKLOAD = 0,
-	SYNC_NOIDLE_WORKLOAD = 1,
-	SYNC_WORKLOAD = 2
-};
-
-struct cfqg_stats {
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	/* number of ios merged */
-	struct blkg_rwstat		merged;
-	/* total time spent on device in ns, may not be accurate w/ queueing */
-	struct blkg_rwstat		service_time;
-	/* total time spent waiting in scheduler queue in ns */
-	struct blkg_rwstat		wait_time;
-	/* number of IOs queued up */
-	struct blkg_rwstat		queued;
-	/* total disk time and nr sectors dispatched by this group */
-	struct blkg_stat		time;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	/* time not charged to this cgroup */
-	struct blkg_stat		unaccounted_time;
-	/* sum of number of ios queued across all samples */
-	struct blkg_stat		avg_queue_size_sum;
-	/* count of samples taken for average */
-	struct blkg_stat		avg_queue_size_samples;
-	/* how many times this group has been removed from service tree */
-	struct blkg_stat		dequeue;
-	/* total time spent waiting for it to be assigned a timeslice. */
-	struct blkg_stat		group_wait_time;
-	/* time spent idling for this blkcg_gq */
-	struct blkg_stat		idle_time;
-	/* total time with empty current active q with other requests queued */
-	struct blkg_stat		empty_time;
-	/* fields after this shouldn't be cleared on stat reset */
-	u64				start_group_wait_time;
-	u64				start_idle_time;
-	u64				start_empty_time;
-	uint16_t			flags;
-#endif	/* CONFIG_DEBUG_BLK_CGROUP */
-#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
-};
-
-/* Per-cgroup data */
-struct cfq_group_data {
-	/* must be the first member */
-	struct blkcg_policy_data cpd;
-
-	unsigned int weight;
-	unsigned int leaf_weight;
-};
-
-/* This is per cgroup per device grouping structure */
-struct cfq_group {
-	/* must be the first member */
-	struct blkg_policy_data pd;
-
-	/* group service_tree member */
-	struct rb_node rb_node;
-
-	/* group service_tree key */
-	u64 vdisktime;
-
-	/*
-	 * The number of active cfqgs and sum of their weights under this
-	 * cfqg.  This covers this cfqg's leaf_weight and all children's
-	 * weights, but does not cover weights of further descendants.
-	 *
-	 * If a cfqg is on the service tree, it's active.  An active cfqg
-	 * also activates its parent and contributes to the children_weight
-	 * of the parent.
-	 */
-	int nr_active;
-	unsigned int children_weight;
-
-	/*
-	 * vfraction is the fraction of vdisktime that the tasks in this
-	 * cfqg are entitled to.  This is determined by compounding the
-	 * ratios walking up from this cfqg to the root.
-	 *
-	 * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
-	 * vfractions on a service tree is approximately 1.  The sum may
-	 * deviate a bit due to rounding errors and fluctuations caused by
-	 * cfqgs entering and leaving the service tree.
-	 */
-	unsigned int vfraction;
-
-	/*
-	 * There are two weights - (internal) weight is the weight of this
-	 * cfqg against the sibling cfqgs.  leaf_weight is the wight of
-	 * this cfqg against the child cfqgs.  For the root cfqg, both
-	 * weights are kept in sync for backward compatibility.
-	 */
-	unsigned int weight;
-	unsigned int new_weight;
-	unsigned int dev_weight;
-
-	unsigned int leaf_weight;
-	unsigned int new_leaf_weight;
-	unsigned int dev_leaf_weight;
-
-	/* number of cfqq currently on this group */
-	int nr_cfqq;
-
-	/*
-	 * Per group busy queues average. Useful for workload slice calc. We
-	 * create the array for each prio class but at run time it is used
-	 * only for RT and BE class and slot for IDLE class remains unused.
-	 * This is primarily done to avoid confusion and a gcc warning.
-	 */
-	unsigned int busy_queues_avg[CFQ_PRIO_NR];
-	/*
-	 * rr lists of queues with requests. We maintain service trees for
-	 * RT and BE classes. These trees are subdivided in subclasses
-	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
-	 * class there is no subclassification and all the cfq queues go on
-	 * a single tree service_tree_idle.
-	 * Counts are embedded in the cfq_rb_root
-	 */
-	struct cfq_rb_root service_trees[2][3];
-	struct cfq_rb_root service_tree_idle;
-
-	u64 saved_wl_slice;
-	enum wl_type_t saved_wl_type;
-	enum wl_class_t saved_wl_class;
-
-	/* number of requests that are on the dispatch list or inside driver */
-	int dispatched;
-	struct cfq_ttime ttime;
-	struct cfqg_stats stats;	/* stats for this cfqg */
-
-	/* async queue for each priority case */
-	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
-	struct cfq_queue *async_idle_cfqq;
-
-};
-
-struct cfq_io_cq {
-	struct io_cq		icq;		/* must be the first member */
-	struct cfq_queue	*cfqq[2];
-	struct cfq_ttime	ttime;
-	int			ioprio;		/* the current ioprio */
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	uint64_t		blkcg_serial_nr; /* the current blkcg serial */
-#endif
-};
-
-/*
- * Per block device queue structure
- */
-struct cfq_data {
-	struct request_queue *queue;
-	/* Root service tree for cfq_groups */
-	struct cfq_rb_root grp_service_tree;
-	struct cfq_group *root_group;
-
-	/*
-	 * The priority currently being served
-	 */
-	enum wl_class_t serving_wl_class;
-	enum wl_type_t serving_wl_type;
-	u64 workload_expires;
-	struct cfq_group *serving_group;
-
-	/*
-	 * Each priority tree is sorted by next_request position.  These
-	 * trees are used when determining if two or more queues are
-	 * interleaving requests (see cfq_close_cooperator).
-	 */
-	struct rb_root prio_trees[CFQ_PRIO_LISTS];
-
-	unsigned int busy_queues;
-	unsigned int busy_sync_queues;
-
-	int rq_in_driver;
-	int rq_in_flight[2];
-
-	/*
-	 * queue-depth detection
-	 */
-	int rq_queued;
-	int hw_tag;
-	/*
-	 * hw_tag can be
-	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
-	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
-	 *  0 => no NCQ
-	 */
-	int hw_tag_est_depth;
-	unsigned int hw_tag_samples;
-
-	/*
-	 * idle window management
-	 */
-	struct hrtimer idle_slice_timer;
-	struct work_struct unplug_work;
-
-	struct cfq_queue *active_queue;
-	struct cfq_io_cq *active_cic;
-
-	sector_t last_position;
-
-	/*
-	 * tunables, see top of file
-	 */
-	unsigned int cfq_quantum;
-	unsigned int cfq_back_penalty;
-	unsigned int cfq_back_max;
-	unsigned int cfq_slice_async_rq;
-	unsigned int cfq_latency;
-	u64 cfq_fifo_expire[2];
-	u64 cfq_slice[2];
-	u64 cfq_slice_idle;
-	u64 cfq_group_idle;
-	u64 cfq_target_latency;
-
-	/*
-	 * Fallback dummy cfqq for extreme OOM conditions
-	 */
-	struct cfq_queue oom_cfqq;
-
-	u64 last_delayed_sync;
-};
-
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
-static void cfq_put_queue(struct cfq_queue *cfqq);
-
-static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
-					    enum wl_class_t class,
-					    enum wl_type_t type)
-{
-	if (!cfqg)
-		return NULL;
-
-	if (class == IDLE_WORKLOAD)
-		return &cfqg->service_tree_idle;
-
-	return &cfqg->service_trees[class][type];
-}
-
-enum cfqq_state_flags {
-	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
-	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
-	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
-	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
-	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
-	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
-	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
-	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
-	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
-	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
-	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
-	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
-	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
-};
-
-#define CFQ_CFQQ_FNS(name)						\
-static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
-{									\
-	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
-}									\
-static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
-{									\
-	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
-}									\
-static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
-{									\
-	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
-}
-
-CFQ_CFQQ_FNS(on_rr);
-CFQ_CFQQ_FNS(wait_request);
-CFQ_CFQQ_FNS(must_dispatch);
-CFQ_CFQQ_FNS(must_alloc_slice);
-CFQ_CFQQ_FNS(fifo_expire);
-CFQ_CFQQ_FNS(idle_window);
-CFQ_CFQQ_FNS(prio_changed);
-CFQ_CFQQ_FNS(slice_new);
-CFQ_CFQQ_FNS(sync);
-CFQ_CFQQ_FNS(coop);
-CFQ_CFQQ_FNS(split_coop);
-CFQ_CFQQ_FNS(deep);
-CFQ_CFQQ_FNS(wait_busy);
-#undef CFQ_CFQQ_FNS
-
-#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
-
-/* cfqg stats flags */
-enum cfqg_stats_flags {
-	CFQG_stats_waiting = 0,
-	CFQG_stats_idling,
-	CFQG_stats_empty,
-};
-
-#define CFQG_FLAG_FNS(name)						\
-static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
-{									\
-	stats->flags |= (1 << CFQG_stats_##name);			\
-}									\
-static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
-{									\
-	stats->flags &= ~(1 << CFQG_stats_##name);			\
-}									\
-static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
-{									\
-	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
-}									\
-
-CFQG_FLAG_FNS(waiting)
-CFQG_FLAG_FNS(idling)
-CFQG_FLAG_FNS(empty)
-#undef CFQG_FLAG_FNS
-
-/* This should be called with the queue_lock held. */
-static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
-{
-	u64 now;
-
-	if (!cfqg_stats_waiting(stats))
-		return;
-
-	now = ktime_get_ns();
-	if (now > stats->start_group_wait_time)
-		blkg_stat_add(&stats->group_wait_time,
-			      now - stats->start_group_wait_time);
-	cfqg_stats_clear_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
-						 struct cfq_group *curr_cfqg)
-{
-	struct cfqg_stats *stats = &cfqg->stats;
-
-	if (cfqg_stats_waiting(stats))
-		return;
-	if (cfqg == curr_cfqg)
-		return;
-	stats->start_group_wait_time = ktime_get_ns();
-	cfqg_stats_mark_waiting(stats);
-}
-
-/* This should be called with the queue_lock held. */
-static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
-{
-	u64 now;
-
-	if (!cfqg_stats_empty(stats))
-		return;
-
-	now = ktime_get_ns();
-	if (now > stats->start_empty_time)
-		blkg_stat_add(&stats->empty_time,
-			      now - stats->start_empty_time);
-	cfqg_stats_clear_empty(stats);
-}
-
-static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
-{
-	blkg_stat_add(&cfqg->stats.dequeue, 1);
-}
-
-static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
-{
-	struct cfqg_stats *stats = &cfqg->stats;
-
-	if (blkg_rwstat_total(&stats->queued))
-		return;
-
-	/*
-	 * group is already marked empty. This can happen if cfqq got new
-	 * request in parent group and moved to this group while being added
-	 * to service tree. Just ignore the event and move on.
-	 */
-	if (cfqg_stats_empty(stats))
-		return;
-
-	stats->start_empty_time = ktime_get_ns();
-	cfqg_stats_mark_empty(stats);
-}
-
-static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
-{
-	struct cfqg_stats *stats = &cfqg->stats;
-
-	if (cfqg_stats_idling(stats)) {
-		u64 now = ktime_get_ns();
-
-		if (now > stats->start_idle_time)
-			blkg_stat_add(&stats->idle_time,
-				      now - stats->start_idle_time);
-		cfqg_stats_clear_idling(stats);
-	}
-}
-
-static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
-{
-	struct cfqg_stats *stats = &cfqg->stats;
-
-	BUG_ON(cfqg_stats_idling(stats));
-
-	stats->start_idle_time = ktime_get_ns();
-	cfqg_stats_mark_idling(stats);
-}
-
-static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
-{
-	struct cfqg_stats *stats = &cfqg->stats;
-
-	blkg_stat_add(&stats->avg_queue_size_sum,
-		      blkg_rwstat_total(&stats->queued));
-	blkg_stat_add(&stats->avg_queue_size_samples, 1);
-	cfqg_stats_update_group_wait_time(stats);
-}
-
-#else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
-
-static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
-static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
-static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
-static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
-static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
-static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
-static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
-
-#endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-
-static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
-{
-	return pd ? container_of(pd, struct cfq_group, pd) : NULL;
-}
-
-static struct cfq_group_data
-*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
-{
-	return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL;
-}
-
-static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
-{
-	return pd_to_blkg(&cfqg->pd);
-}
-
-static struct blkcg_policy blkcg_policy_cfq;
-
-static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
-{
-	return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
-}
-
-static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
-{
-	return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
-}
-
-static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
-{
-	struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
-
-	return pblkg ? blkg_to_cfqg(pblkg) : NULL;
-}
-
-static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
-				      struct cfq_group *ancestor)
-{
-	return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup,
-				    cfqg_to_blkg(ancestor)->blkcg->css.cgroup);
-}
-
-static inline void cfqg_get(struct cfq_group *cfqg)
-{
-	return blkg_get(cfqg_to_blkg(cfqg));
-}
-
-static inline void cfqg_put(struct cfq_group *cfqg)
-{
-	return blkg_put(cfqg_to_blkg(cfqg));
-}
-
-#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	do {			\
-	blk_add_cgroup_trace_msg((cfqd)->queue,				\
-			cfqg_to_blkg((cfqq)->cfqg)->blkcg,		\
-			"cfq%d%c%c " fmt, (cfqq)->pid,			\
-			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
-			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
-			  ##args);					\
-} while (0)
-
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	do {			\
-	blk_add_cgroup_trace_msg((cfqd)->queue,				\
-			cfqg_to_blkg(cfqg)->blkcg, fmt, ##args);	\
-} while (0)
-
-static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
-					    struct cfq_group *curr_cfqg,
-					    unsigned int op)
-{
-	blkg_rwstat_add(&cfqg->stats.queued, op, 1);
-	cfqg_stats_end_empty_time(&cfqg->stats);
-	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
-}
-
-static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
-			uint64_t time, unsigned long unaccounted_time)
-{
-	blkg_stat_add(&cfqg->stats.time, time);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
-#endif
-}
-
-static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
-					       unsigned int op)
-{
-	blkg_rwstat_add(&cfqg->stats.queued, op, -1);
-}
-
-static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
-					       unsigned int op)
-{
-	blkg_rwstat_add(&cfqg->stats.merged, op, 1);
-}
-
-static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-						u64 start_time_ns,
-						u64 io_start_time_ns,
-						unsigned int op)
-{
-	struct cfqg_stats *stats = &cfqg->stats;
-	u64 now = ktime_get_ns();
-
-	if (now > io_start_time_ns)
-		blkg_rwstat_add(&stats->service_time, op,
-				now - io_start_time_ns);
-	if (io_start_time_ns > start_time_ns)
-		blkg_rwstat_add(&stats->wait_time, op,
-				io_start_time_ns - start_time_ns);
-}
-
-/* @stats = 0 */
-static void cfqg_stats_reset(struct cfqg_stats *stats)
-{
-	/* queued stats shouldn't be cleared */
-	blkg_rwstat_reset(&stats->merged);
-	blkg_rwstat_reset(&stats->service_time);
-	blkg_rwstat_reset(&stats->wait_time);
-	blkg_stat_reset(&stats->time);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_reset(&stats->unaccounted_time);
-	blkg_stat_reset(&stats->avg_queue_size_sum);
-	blkg_stat_reset(&stats->avg_queue_size_samples);
-	blkg_stat_reset(&stats->dequeue);
-	blkg_stat_reset(&stats->group_wait_time);
-	blkg_stat_reset(&stats->idle_time);
-	blkg_stat_reset(&stats->empty_time);
-#endif
-}
-
-/* @to += @from */
-static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from)
-{
-	/* queued stats shouldn't be cleared */
-	blkg_rwstat_add_aux(&to->merged, &from->merged);
-	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
-	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
-	blkg_stat_add_aux(&from->time, &from->time);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time);
-	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
-	blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples);
-	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
-	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
-	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
-	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
-#endif
-}
-
-/*
- * Transfer @cfqg's stats to its parent's aux counts so that the ancestors'
- * recursive stats can still account for the amount used by this cfqg after
- * it's gone.
- */
-static void cfqg_stats_xfer_dead(struct cfq_group *cfqg)
-{
-	struct cfq_group *parent = cfqg_parent(cfqg);
-
-	lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock);
-
-	if (unlikely(!parent))
-		return;
-
-	cfqg_stats_add_aux(&parent->stats, &cfqg->stats);
-	cfqg_stats_reset(&cfqg->stats);
-}
-
-#else	/* CONFIG_CFQ_GROUP_IOSCHED */
-
-static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; }
-static inline bool cfqg_is_descendant(struct cfq_group *cfqg,
-				      struct cfq_group *ancestor)
-{
-	return true;
-}
-static inline void cfqg_get(struct cfq_group *cfqg) { }
-static inline void cfqg_put(struct cfq_group *cfqg) { }
-
-#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
-	blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid,	\
-			cfq_cfqq_sync((cfqq)) ? 'S' : 'A',		\
-			cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
-				##args)
-#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
-
-static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
-			struct cfq_group *curr_cfqg, unsigned int op) { }
-static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
-			uint64_t time, unsigned long unaccounted_time) { }
-static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg,
-			unsigned int op) { }
-static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg,
-			unsigned int op) { }
-static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
-						u64 start_time_ns,
-						u64 io_start_time_ns,
-						unsigned int op) { }
-
-#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
-
-#define cfq_log(cfqd, fmt, args...)	\
-	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
-
-/* Traverses through cfq group service trees */
-#define for_each_cfqg_st(cfqg, i, j, st) \
-	for (i = 0; i <= IDLE_WORKLOAD; i++) \
-		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
-			: &cfqg->service_tree_idle; \
-			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
-			(i == IDLE_WORKLOAD && j == 0); \
-			j++, st = i < IDLE_WORKLOAD ? \
-			&cfqg->service_trees[i][j]: NULL) \
-
-static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
-	struct cfq_ttime *ttime, bool group_idle)
-{
-	u64 slice;
-	if (!sample_valid(ttime->ttime_samples))
-		return false;
-	if (group_idle)
-		slice = cfqd->cfq_group_idle;
-	else
-		slice = cfqd->cfq_slice_idle;
-	return ttime->ttime_mean > slice;
-}
-
-static inline bool iops_mode(struct cfq_data *cfqd)
-{
-	/*
-	 * If we are not idling on queues and it is a NCQ drive, parallel
-	 * execution of requests is on and measuring time is not possible
-	 * in most of the cases until and unless we drive shallower queue
-	 * depths and that becomes a performance bottleneck. In such cases
-	 * switch to start providing fairness in terms of number of IOs.
-	 */
-	if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
-		return true;
-	else
-		return false;
-}
-
-static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
-{
-	if (cfq_class_idle(cfqq))
-		return IDLE_WORKLOAD;
-	if (cfq_class_rt(cfqq))
-		return RT_WORKLOAD;
-	return BE_WORKLOAD;
-}
-
-
-static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
-{
-	if (!cfq_cfqq_sync(cfqq))
-		return ASYNC_WORKLOAD;
-	if (!cfq_cfqq_idle_window(cfqq))
-		return SYNC_NOIDLE_WORKLOAD;
-	return SYNC_WORKLOAD;
-}
-
-static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
-					struct cfq_data *cfqd,
-					struct cfq_group *cfqg)
-{
-	if (wl_class == IDLE_WORKLOAD)
-		return cfqg->service_tree_idle.count;
-
-	return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
-		cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
-		cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
-}
-
-static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
-					struct cfq_group *cfqg)
-{
-	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
-		cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
-}
-
-static void cfq_dispatch_insert(struct request_queue *, struct request *);
-static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-				       struct cfq_io_cq *cic, struct bio *bio);
-
-static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
-{
-	/* cic->icq is the first member, %NULL will convert to %NULL */
-	return container_of(icq, struct cfq_io_cq, icq);
-}
-
-static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
-					       struct io_context *ioc)
-{
-	if (ioc)
-		return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
-	return NULL;
-}
-
-static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
-{
-	return cic->cfqq[is_sync];
-}
-
-static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
-				bool is_sync)
-{
-	cic->cfqq[is_sync] = cfqq;
-}
-
-static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
-{
-	return cic->icq.q->elevator->elevator_data;
-}
-
-/*
- * scheduler run of queue, if there are requests pending and no one in the
- * driver that will restart queueing
- */
-static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
-{
-	if (cfqd->busy_queues) {
-		cfq_log(cfqd, "schedule dispatch");
-		kblockd_schedule_work(&cfqd->unplug_work);
-	}
-}
-
-/*
- * Scale schedule slice based on io priority. Use the sync time slice only
- * if a queue is marked sync and has sync io queued. A sync queue with async
- * io only, should not get full sync slice length.
- */
-static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
-				 unsigned short prio)
-{
-	u64 base_slice = cfqd->cfq_slice[sync];
-	u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
-
-	WARN_ON(prio >= IOPRIO_BE_NR);
-
-	return base_slice + (slice * (4 - prio));
-}
-
-static inline u64
-cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
-}
-
-/**
- * cfqg_scale_charge - scale disk time charge according to cfqg weight
- * @charge: disk time being charged
- * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
- *
- * Scale @charge according to @vfraction, which is in range (0, 1].  The
- * scaling is inversely proportional.
- *
- * scaled = charge / vfraction
- *
- * The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
- */
-static inline u64 cfqg_scale_charge(u64 charge,
-				    unsigned int vfraction)
-{
-	u64 c = charge << CFQ_SERVICE_SHIFT;	/* make it fixed point */
-
-	/* charge / vfraction */
-	c <<= CFQ_SERVICE_SHIFT;
-	return div_u64(c, vfraction);
-}
-
-static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
-{
-	s64 delta = (s64)(vdisktime - min_vdisktime);
-	if (delta > 0)
-		min_vdisktime = vdisktime;
-
-	return min_vdisktime;
-}
-
-static void update_min_vdisktime(struct cfq_rb_root *st)
-{
-	if (!RB_EMPTY_ROOT(&st->rb.rb_root)) {
-		struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost);
-
-		st->min_vdisktime = max_vdisktime(st->min_vdisktime,
-						  cfqg->vdisktime);
-	}
-}
-
-/*
- * get averaged number of queues of RT/BE priority.
- * average is updated, with a formula that gives more weight to higher numbers,
- * to quickly follows sudden increases and decrease slowly
- */
-
-static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
-					struct cfq_group *cfqg, bool rt)
-{
-	unsigned min_q, max_q;
-	unsigned mult  = cfq_hist_divisor - 1;
-	unsigned round = cfq_hist_divisor / 2;
-	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
-
-	min_q = min(cfqg->busy_queues_avg[rt], busy);
-	max_q = max(cfqg->busy_queues_avg[rt], busy);
-	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
-		cfq_hist_divisor;
-	return cfqg->busy_queues_avg[rt];
-}
-
-static inline u64
-cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
-	return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
-}
-
-static inline u64
-cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	u64 slice = cfq_prio_to_slice(cfqd, cfqq);
-	if (cfqd->cfq_latency) {
-		/*
-		 * interested queues (we consider only the ones with the same
-		 * priority class in the cfq group)
-		 */
-		unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
-						cfq_class_rt(cfqq));
-		u64 sync_slice = cfqd->cfq_slice[1];
-		u64 expect_latency = sync_slice * iq;
-		u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
-
-		if (expect_latency > group_slice) {
-			u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
-			u64 low_slice;
-
-			/* scale low_slice according to IO priority
-			 * and sync vs async */
-			low_slice = div64_u64(base_low_slice*slice, sync_slice);
-			low_slice = min(slice, low_slice);
-			/* the adapted slice value is scaled to fit all iqs
-			 * into the target latency */
-			slice = div64_u64(slice*group_slice, expect_latency);
-			slice = max(slice, low_slice);
-		}
-	}
-	return slice;
-}
-
-static inline void
-cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
-	u64 now = ktime_get_ns();
-
-	cfqq->slice_start = now;
-	cfqq->slice_end = now + slice;
-	cfqq->allocated_slice = slice;
-	cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
-}
-
-/*
- * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
- * isn't valid until the first request from the dispatch is activated
- * and the slice time set.
- */
-static inline bool cfq_slice_used(struct cfq_queue *cfqq)
-{
-	if (cfq_cfqq_slice_new(cfqq))
-		return false;
-	if (ktime_get_ns() < cfqq->slice_end)
-		return false;
-
-	return true;
-}
-
-/*
- * Lifted from AS - choose which of rq1 and rq2 that is best served now.
- * We choose the request that is closest to the head right now. Distance
- * behind the head is penalized and only allowed to a certain extent.
- */
-static struct request *
-cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
-{
-	sector_t s1, s2, d1 = 0, d2 = 0;
-	unsigned long back_max;
-#define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
-#define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
-	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
-
-	if (rq1 == NULL || rq1 == rq2)
-		return rq2;
-	if (rq2 == NULL)
-		return rq1;
-
-	if (rq_is_sync(rq1) != rq_is_sync(rq2))
-		return rq_is_sync(rq1) ? rq1 : rq2;
-
-	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
-		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
-
-	s1 = blk_rq_pos(rq1);
-	s2 = blk_rq_pos(rq2);
-
-	/*
-	 * by definition, 1KiB is 2 sectors
-	 */
-	back_max = cfqd->cfq_back_max * 2;
-
-	/*
-	 * Strict one way elevator _except_ in the case where we allow
-	 * short backward seeks which are biased as twice the cost of a
-	 * similar forward seek.
-	 */
-	if (s1 >= last)
-		d1 = s1 - last;
-	else if (s1 + back_max >= last)
-		d1 = (last - s1) * cfqd->cfq_back_penalty;
-	else
-		wrap |= CFQ_RQ1_WRAP;
-
-	if (s2 >= last)
-		d2 = s2 - last;
-	else if (s2 + back_max >= last)
-		d2 = (last - s2) * cfqd->cfq_back_penalty;
-	else
-		wrap |= CFQ_RQ2_WRAP;
-
-	/* Found required data */
-
-	/*
-	 * By doing switch() on the bit mask "wrap" we avoid having to
-	 * check two variables for all permutations: --> faster!
-	 */
-	switch (wrap) {
-	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
-		if (d1 < d2)
-			return rq1;
-		else if (d2 < d1)
-			return rq2;
-		else {
-			if (s1 >= s2)
-				return rq1;
-			else
-				return rq2;
-		}
-
-	case CFQ_RQ2_WRAP:
-		return rq1;
-	case CFQ_RQ1_WRAP:
-		return rq2;
-	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
-	default:
-		/*
-		 * Since both rqs are wrapped,
-		 * start with the one that's further behind head
-		 * (--> only *one* back seek required),
-		 * since back seek takes more time than forward.
-		 */
-		if (s1 <= s2)
-			return rq1;
-		else
-			return rq2;
-	}
-}
-
-static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
-{
-	/* Service tree is empty */
-	if (!root->count)
-		return NULL;
-
-	return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
-}
-
-static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
-{
-	return rb_entry_cfqg(rb_first_cached(&root->rb));
-}
-
-static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
-{
-	if (root->rb_rightmost == n)
-		root->rb_rightmost = rb_prev(n);
-
-	rb_erase_cached(n, &root->rb);
-	RB_CLEAR_NODE(n);
-
-	--root->count;
-}
-
-/*
- * would be nice to take fifo expire time into account as well
- */
-static struct request *
-cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		  struct request *last)
-{
-	struct rb_node *rbnext = rb_next(&last->rb_node);
-	struct rb_node *rbprev = rb_prev(&last->rb_node);
-	struct request *next = NULL, *prev = NULL;
-
-	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
-
-	if (rbprev)
-		prev = rb_entry_rq(rbprev);
-
-	if (rbnext)
-		next = rb_entry_rq(rbnext);
-	else {
-		rbnext = rb_first(&cfqq->sort_list);
-		if (rbnext && rbnext != &last->rb_node)
-			next = rb_entry_rq(rbnext);
-	}
-
-	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
-}
-
-static u64 cfq_slice_offset(struct cfq_data *cfqd,
-			    struct cfq_queue *cfqq)
-{
-	/*
-	 * just an approximation, should be ok.
-	 */
-	return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
-		       cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
-}
-
-static inline s64
-cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
-{
-	return cfqg->vdisktime - st->min_vdisktime;
-}
-
-static void
-__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
-{
-	struct rb_node **node = &st->rb.rb_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct cfq_group *__cfqg;
-	s64 key = cfqg_key(st, cfqg);
-	bool leftmost = true, rightmost = true;
-
-	while (*node != NULL) {
-		parent = *node;
-		__cfqg = rb_entry_cfqg(parent);
-
-		if (key < cfqg_key(st, __cfqg)) {
-			node = &parent->rb_left;
-			rightmost = false;
-		} else {
-			node = &parent->rb_right;
-			leftmost = false;
-		}
-	}
-
-	if (rightmost)
-		st->rb_rightmost = &cfqg->rb_node;
-
-	rb_link_node(&cfqg->rb_node, parent, node);
-	rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
-}
-
-/*
- * This has to be called only on activation of cfqg
- */
-static void
-cfq_update_group_weight(struct cfq_group *cfqg)
-{
-	if (cfqg->new_weight) {
-		cfqg->weight = cfqg->new_weight;
-		cfqg->new_weight = 0;
-	}
-}
-
-static void
-cfq_update_group_leaf_weight(struct cfq_group *cfqg)
-{
-	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
-
-	if (cfqg->new_leaf_weight) {
-		cfqg->leaf_weight = cfqg->new_leaf_weight;
-		cfqg->new_leaf_weight = 0;
-	}
-}
-
-static void
-cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
-{
-	unsigned int vfr = 1 << CFQ_SERVICE_SHIFT;	/* start with 1 */
-	struct cfq_group *pos = cfqg;
-	struct cfq_group *parent;
-	bool propagate;
-
-	/* add to the service tree */
-	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
-
-	/*
-	 * Update leaf_weight.  We cannot update weight at this point
-	 * because cfqg might already have been activated and is
-	 * contributing its current weight to the parent's child_weight.
-	 */
-	cfq_update_group_leaf_weight(cfqg);
-	__cfq_group_service_tree_add(st, cfqg);
-
-	/*
-	 * Activate @cfqg and calculate the portion of vfraction @cfqg is
-	 * entitled to.  vfraction is calculated by walking the tree
-	 * towards the root calculating the fraction it has at each level.
-	 * The compounded ratio is how much vfraction @cfqg owns.
-	 *
-	 * Start with the proportion tasks in this cfqg has against active
-	 * children cfqgs - its leaf_weight against children_weight.
-	 */
-	propagate = !pos->nr_active++;
-	pos->children_weight += pos->leaf_weight;
-	vfr = vfr * pos->leaf_weight / pos->children_weight;
-
-	/*
-	 * Compound ->weight walking up the tree.  Both activation and
-	 * vfraction calculation are done in the same loop.  Propagation
-	 * stops once an already activated node is met.  vfraction
-	 * calculation should always continue to the root.
-	 */
-	while ((parent = cfqg_parent(pos))) {
-		if (propagate) {
-			cfq_update_group_weight(pos);
-			propagate = !parent->nr_active++;
-			parent->children_weight += pos->weight;
-		}
-		vfr = vfr * pos->weight / parent->children_weight;
-		pos = parent;
-	}
-
-	cfqg->vfraction = max_t(unsigned, vfr, 1);
-}
-
-static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
-{
-	if (!iops_mode(cfqd))
-		return CFQ_SLICE_MODE_GROUP_DELAY;
-	else
-		return CFQ_IOPS_MODE_GROUP_DELAY;
-}
-
-static void
-cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *__cfqg;
-	struct rb_node *n;
-
-	cfqg->nr_cfqq++;
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
-		return;
-
-	/*
-	 * Currently put the group at the end. Later implement something
-	 * so that groups get lesser vtime based on their weights, so that
-	 * if group does not loose all if it was not continuously backlogged.
-	 */
-	n = st->rb_rightmost;
-	if (n) {
-		__cfqg = rb_entry_cfqg(n);
-		cfqg->vdisktime = __cfqg->vdisktime +
-			cfq_get_cfqg_vdisktime_delay(cfqd);
-	} else
-		cfqg->vdisktime = st->min_vdisktime;
-	cfq_group_service_tree_add(st, cfqg);
-}
-
-static void
-cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
-{
-	struct cfq_group *pos = cfqg;
-	bool propagate;
-
-	/*
-	 * Undo activation from cfq_group_service_tree_add().  Deactivate
-	 * @cfqg and propagate deactivation upwards.
-	 */
-	propagate = !--pos->nr_active;
-	pos->children_weight -= pos->leaf_weight;
-
-	while (propagate) {
-		struct cfq_group *parent = cfqg_parent(pos);
-
-		/* @pos has 0 nr_active at this point */
-		WARN_ON_ONCE(pos->children_weight);
-		pos->vfraction = 0;
-
-		if (!parent)
-			break;
-
-		propagate = !--parent->nr_active;
-		parent->children_weight -= pos->weight;
-		pos = parent;
-	}
-
-	/* remove from the service tree */
-	if (!RB_EMPTY_NODE(&cfqg->rb_node))
-		cfq_rb_erase(&cfqg->rb_node, st);
-}
-
-static void
-cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-
-	BUG_ON(cfqg->nr_cfqq < 1);
-	cfqg->nr_cfqq--;
-
-	/* If there are other cfq queues under this group, don't delete it */
-	if (cfqg->nr_cfqq)
-		return;
-
-	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-	cfq_group_service_tree_del(st, cfqg);
-	cfqg->saved_wl_slice = 0;
-	cfqg_stats_update_dequeue(cfqg);
-}
-
-static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
-				       u64 *unaccounted_time)
-{
-	u64 slice_used;
-	u64 now = ktime_get_ns();
-
-	/*
-	 * Queue got expired before even a single request completed or
-	 * got expired immediately after first request completion.
-	 */
-	if (!cfqq->slice_start || cfqq->slice_start == now) {
-		/*
-		 * Also charge the seek time incurred to the group, otherwise
-		 * if there are mutiple queues in the group, each can dispatch
-		 * a single request on seeky media and cause lots of seek time
-		 * and group will never know it.
-		 */
-		slice_used = max_t(u64, (now - cfqq->dispatch_start),
-					jiffies_to_nsecs(1));
-	} else {
-		slice_used = now - cfqq->slice_start;
-		if (slice_used > cfqq->allocated_slice) {
-			*unaccounted_time = slice_used - cfqq->allocated_slice;
-			slice_used = cfqq->allocated_slice;
-		}
-		if (cfqq->slice_start > cfqq->dispatch_start)
-			*unaccounted_time += cfqq->slice_start -
-					cfqq->dispatch_start;
-	}
-
-	return slice_used;
-}
-
-static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
-				struct cfq_queue *cfqq)
-{
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	u64 used_sl, charge, unaccounted_sl = 0;
-	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
-			- cfqg->service_tree_idle.count;
-	unsigned int vfr;
-	u64 now = ktime_get_ns();
-
-	BUG_ON(nr_sync < 0);
-	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
-
-	if (iops_mode(cfqd))
-		charge = cfqq->slice_dispatch;
-	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
-		charge = cfqq->allocated_slice;
-
-	/*
-	 * Can't update vdisktime while on service tree and cfqg->vfraction
-	 * is valid only while on it.  Cache vfr, leave the service tree,
-	 * update vdisktime and go back on.  The re-addition to the tree
-	 * will also update the weights as necessary.
-	 */
-	vfr = cfqg->vfraction;
-	cfq_group_service_tree_del(st, cfqg);
-	cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
-	cfq_group_service_tree_add(st, cfqg);
-
-	/* This group is being expired. Save the context */
-	if (cfqd->workload_expires > now) {
-		cfqg->saved_wl_slice = cfqd->workload_expires - now;
-		cfqg->saved_wl_type = cfqd->serving_wl_type;
-		cfqg->saved_wl_class = cfqd->serving_wl_class;
-	} else
-		cfqg->saved_wl_slice = 0;
-
-	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
-					st->min_vdisktime);
-	cfq_log_cfqq(cfqq->cfqd, cfqq,
-		     "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
-		     used_sl, cfqq->slice_dispatch, charge,
-		     iops_mode(cfqd), cfqq->nr_sectors);
-	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
-	cfqg_stats_set_start_empty_time(cfqg);
-}
-
-/**
- * cfq_init_cfqg_base - initialize base part of a cfq_group
- * @cfqg: cfq_group to initialize
- *
- * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
- * is enabled or not.
- */
-static void cfq_init_cfqg_base(struct cfq_group *cfqg)
-{
-	struct cfq_rb_root *st;
-	int i, j;
-
-	for_each_cfqg_st(cfqg, i, j, st)
-		*st = CFQ_RB_ROOT;
-	RB_CLEAR_NODE(&cfqg->rb_node);
-
-	cfqg->ttime.last_end_request = ktime_get_ns();
-}
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
-			    bool on_dfl, bool reset_dev, bool is_leaf_weight);
-
-static void cfqg_stats_exit(struct cfqg_stats *stats)
-{
-	blkg_rwstat_exit(&stats->merged);
-	blkg_rwstat_exit(&stats->service_time);
-	blkg_rwstat_exit(&stats->wait_time);
-	blkg_rwstat_exit(&stats->queued);
-	blkg_stat_exit(&stats->time);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	blkg_stat_exit(&stats->unaccounted_time);
-	blkg_stat_exit(&stats->avg_queue_size_sum);
-	blkg_stat_exit(&stats->avg_queue_size_samples);
-	blkg_stat_exit(&stats->dequeue);
-	blkg_stat_exit(&stats->group_wait_time);
-	blkg_stat_exit(&stats->idle_time);
-	blkg_stat_exit(&stats->empty_time);
-#endif
-}
-
-static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp)
-{
-	if (blkg_rwstat_init(&stats->merged, gfp) ||
-	    blkg_rwstat_init(&stats->service_time, gfp) ||
-	    blkg_rwstat_init(&stats->wait_time, gfp) ||
-	    blkg_rwstat_init(&stats->queued, gfp) ||
-	    blkg_stat_init(&stats->time, gfp))
-		goto err;
-
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	if (blkg_stat_init(&stats->unaccounted_time, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
-	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
-	    blkg_stat_init(&stats->dequeue, gfp) ||
-	    blkg_stat_init(&stats->group_wait_time, gfp) ||
-	    blkg_stat_init(&stats->idle_time, gfp) ||
-	    blkg_stat_init(&stats->empty_time, gfp))
-		goto err;
-#endif
-	return 0;
-err:
-	cfqg_stats_exit(stats);
-	return -ENOMEM;
-}
-
-static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
-{
-	struct cfq_group_data *cgd;
-
-	cgd = kzalloc(sizeof(*cgd), gfp);
-	if (!cgd)
-		return NULL;
-	return &cgd->cpd;
-}
-
-static void cfq_cpd_init(struct blkcg_policy_data *cpd)
-{
-	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
-	unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
-			      CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
-
-	if (cpd_to_blkcg(cpd) == &blkcg_root)
-		weight *= 2;
-
-	cgd->weight = weight;
-	cgd->leaf_weight = weight;
-}
-
-static void cfq_cpd_free(struct blkcg_policy_data *cpd)
-{
-	kfree(cpd_to_cfqgd(cpd));
-}
-
-static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
-{
-	struct blkcg *blkcg = cpd_to_blkcg(cpd);
-	bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
-	unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
-
-	if (blkcg == &blkcg_root)
-		weight *= 2;
-
-	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false));
-	WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true));
-}
-
-static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node)
-{
-	struct cfq_group *cfqg;
-
-	cfqg = kzalloc_node(sizeof(*cfqg), gfp, node);
-	if (!cfqg)
-		return NULL;
-
-	cfq_init_cfqg_base(cfqg);
-	if (cfqg_stats_init(&cfqg->stats, gfp)) {
-		kfree(cfqg);
-		return NULL;
-	}
-
-	return &cfqg->pd;
-}
-
-static void cfq_pd_init(struct blkg_policy_data *pd)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-	struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg);
-
-	cfqg->weight = cgd->weight;
-	cfqg->leaf_weight = cgd->leaf_weight;
-}
-
-static void cfq_pd_offline(struct blkg_policy_data *pd)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-	int i;
-
-	for (i = 0; i < IOPRIO_BE_NR; i++) {
-		if (cfqg->async_cfqq[0][i]) {
-			cfq_put_queue(cfqg->async_cfqq[0][i]);
-			cfqg->async_cfqq[0][i] = NULL;
-		}
-		if (cfqg->async_cfqq[1][i]) {
-			cfq_put_queue(cfqg->async_cfqq[1][i]);
-			cfqg->async_cfqq[1][i] = NULL;
-		}
-	}
-
-	if (cfqg->async_idle_cfqq) {
-		cfq_put_queue(cfqg->async_idle_cfqq);
-		cfqg->async_idle_cfqq = NULL;
-	}
-
-	/*
-	 * @blkg is going offline and will be ignored by
-	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
-	 * that they don't get lost.  If IOs complete after this point, the
-	 * stats for them will be lost.  Oh well...
-	 */
-	cfqg_stats_xfer_dead(cfqg);
-}
-
-static void cfq_pd_free(struct blkg_policy_data *pd)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-
-	cfqg_stats_exit(&cfqg->stats);
-	return kfree(cfqg);
-}
-
-static void cfq_pd_reset_stats(struct blkg_policy_data *pd)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-
-	cfqg_stats_reset(&cfqg->stats);
-}
-
-static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
-					 struct blkcg *blkcg)
-{
-	struct blkcg_gq *blkg;
-
-	blkg = blkg_lookup(blkcg, cfqd->queue);
-	if (likely(blkg))
-		return blkg_to_cfqg(blkg);
-	return NULL;
-}
-
-static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
-{
-	cfqq->cfqg = cfqg;
-	/* cfqq reference on cfqg */
-	cfqg_get(cfqg);
-}
-
-static u64 cfqg_prfill_weight_device(struct seq_file *sf,
-				     struct blkg_policy_data *pd, int off)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-
-	if (!cfqg->dev_weight)
-		return 0;
-	return __blkg_prfill_u64(sf, pd, cfqg->dev_weight);
-}
-
-static int cfqg_print_weight_device(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_weight_device, &blkcg_policy_cfq,
-			  0, false);
-	return 0;
-}
-
-static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf,
-					  struct blkg_policy_data *pd, int off)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-
-	if (!cfqg->dev_leaf_weight)
-		return 0;
-	return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight);
-}
-
-static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq,
-			  0, false);
-	return 0;
-}
-
-static int cfq_print_weight(struct seq_file *sf, void *v)
-{
-	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
-	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
-	unsigned int val = 0;
-
-	if (cgd)
-		val = cgd->weight;
-
-	seq_printf(sf, "%u\n", val);
-	return 0;
-}
-
-static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
-{
-	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
-	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
-	unsigned int val = 0;
-
-	if (cgd)
-		val = cgd->leaf_weight;
-
-	seq_printf(sf, "%u\n", val);
-	return 0;
-}
-
-static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
-					char *buf, size_t nbytes, loff_t off,
-					bool on_dfl, bool is_leaf_weight)
-{
-	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
-	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
-	struct blkcg *blkcg = css_to_blkcg(of_css(of));
-	struct blkg_conf_ctx ctx;
-	struct cfq_group *cfqg;
-	struct cfq_group_data *cfqgd;
-	int ret;
-	u64 v;
-
-	ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
-	if (ret)
-		return ret;
-
-	if (sscanf(ctx.body, "%llu", &v) == 1) {
-		/* require "default" on dfl */
-		ret = -ERANGE;
-		if (!v && on_dfl)
-			goto out_finish;
-	} else if (!strcmp(strim(ctx.body), "default")) {
-		v = 0;
-	} else {
-		ret = -EINVAL;
-		goto out_finish;
-	}
-
-	cfqg = blkg_to_cfqg(ctx.blkg);
-	cfqgd = blkcg_to_cfqgd(blkcg);
-
-	ret = -ERANGE;
-	if (!v || (v >= min && v <= max)) {
-		if (!is_leaf_weight) {
-			cfqg->dev_weight = v;
-			cfqg->new_weight = v ?: cfqgd->weight;
-		} else {
-			cfqg->dev_leaf_weight = v;
-			cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight;
-		}
-		ret = 0;
-	}
-out_finish:
-	blkg_conf_finish(&ctx);
-	return ret ?: nbytes;
-}
-
-static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
-				      char *buf, size_t nbytes, loff_t off)
-{
-	return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
-}
-
-static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
-					   char *buf, size_t nbytes, loff_t off)
-{
-	return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
-}
-
-static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
-			    bool on_dfl, bool reset_dev, bool is_leaf_weight)
-{
-	unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN;
-	unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX;
-	struct blkcg *blkcg = css_to_blkcg(css);
-	struct blkcg_gq *blkg;
-	struct cfq_group_data *cfqgd;
-	int ret = 0;
-
-	if (val < min || val > max)
-		return -ERANGE;
-
-	spin_lock_irq(&blkcg->lock);
-	cfqgd = blkcg_to_cfqgd(blkcg);
-	if (!cfqgd) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	if (!is_leaf_weight)
-		cfqgd->weight = val;
-	else
-		cfqgd->leaf_weight = val;
-
-	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
-		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
-
-		if (!cfqg)
-			continue;
-
-		if (!is_leaf_weight) {
-			if (reset_dev)
-				cfqg->dev_weight = 0;
-			if (!cfqg->dev_weight)
-				cfqg->new_weight = cfqgd->weight;
-		} else {
-			if (reset_dev)
-				cfqg->dev_leaf_weight = 0;
-			if (!cfqg->dev_leaf_weight)
-				cfqg->new_leaf_weight = cfqgd->leaf_weight;
-		}
-	}
-
-out:
-	spin_unlock_irq(&blkcg->lock);
-	return ret;
-}
-
-static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
-			  u64 val)
-{
-	return __cfq_set_weight(css, val, false, false, false);
-}
-
-static int cfq_set_leaf_weight(struct cgroup_subsys_state *css,
-			       struct cftype *cft, u64 val)
-{
-	return __cfq_set_weight(css, val, false, false, true);
-}
-
-static int cfqg_print_stat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
-			  &blkcg_policy_cfq, seq_cft(sf)->private, false);
-	return 0;
-}
-
-static int cfqg_print_rwstat(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
-			  &blkcg_policy_cfq, seq_cft(sf)->private, true);
-	return 0;
-}
-
-static u64 cfqg_prfill_stat_recursive(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
-					  &blkcg_policy_cfq, off);
-	return __blkg_prfill_u64(sf, pd, sum);
-}
-
-static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf,
-					struct blkg_policy_data *pd, int off)
-{
-	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
-							&blkcg_policy_cfq, off);
-	return __blkg_prfill_rwstat(sf, pd, &sum);
-}
-
-static int cfqg_print_stat_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_stat_recursive, &blkcg_policy_cfq,
-			  seq_cft(sf)->private, false);
-	return 0;
-}
-
-static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq,
-			  seq_cft(sf)->private, true);
-	return 0;
-}
-
-static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
-			       int off)
-{
-	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
-
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
-}
-
-static int cfqg_print_stat_sectors(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false);
-	return 0;
-}
-
-static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf,
-					 struct blkg_policy_data *pd, int off)
-{
-	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
-					offsetof(struct blkcg_gq, stat_bytes));
-	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
-		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
-
-	return __blkg_prfill_u64(sf, pd, sum >> 9);
-}
-
-static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0,
-			  false);
-	return 0;
-}
-
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
-				      struct blkg_policy_data *pd, int off)
-{
-	struct cfq_group *cfqg = pd_to_cfqg(pd);
-	u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
-	u64 v = 0;
-
-	if (samples) {
-		v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
-		v = div64_u64(v, samples);
-	}
-	__blkg_prfill_u64(sf, pd, v);
-	return 0;
-}
-
-/* print avg_queue_size */
-static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v)
-{
-	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
-			  cfqg_prfill_avg_queue_size, &blkcg_policy_cfq,
-			  0, false);
-	return 0;
-}
-#endif	/* CONFIG_DEBUG_BLK_CGROUP */
-
-static struct cftype cfq_blkcg_legacy_files[] = {
-	/* on root, weight is mapped to leaf_weight */
-	{
-		.name = "weight_device",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cfqg_print_leaf_weight_device,
-		.write = cfqg_set_leaf_weight_device,
-	},
-	{
-		.name = "weight",
-		.flags = CFTYPE_ONLY_ON_ROOT,
-		.seq_show = cfq_print_leaf_weight,
-		.write_u64 = cfq_set_leaf_weight,
-	},
-
-	/* no such mapping necessary for !roots */
-	{
-		.name = "weight_device",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cfqg_print_weight_device,
-		.write = cfqg_set_weight_device,
-	},
-	{
-		.name = "weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cfq_print_weight,
-		.write_u64 = cfq_set_weight,
-	},
-
-	{
-		.name = "leaf_weight_device",
-		.seq_show = cfqg_print_leaf_weight_device,
-		.write = cfqg_set_leaf_weight_device,
-	},
-	{
-		.name = "leaf_weight",
-		.seq_show = cfq_print_leaf_weight,
-		.write_u64 = cfq_set_leaf_weight,
-	},
-
-	/* statistics, covers only the tasks in the cfqg */
-	{
-		.name = "time",
-		.private = offsetof(struct cfq_group, stats.time),
-		.seq_show = cfqg_print_stat,
-	},
-	{
-		.name = "sectors",
-		.seq_show = cfqg_print_stat_sectors,
-	},
-	{
-		.name = "io_service_bytes",
-		.private = (unsigned long)&blkcg_policy_cfq,
-		.seq_show = blkg_print_stat_bytes,
-	},
-	{
-		.name = "io_serviced",
-		.private = (unsigned long)&blkcg_policy_cfq,
-		.seq_show = blkg_print_stat_ios,
-	},
-	{
-		.name = "io_service_time",
-		.private = offsetof(struct cfq_group, stats.service_time),
-		.seq_show = cfqg_print_rwstat,
-	},
-	{
-		.name = "io_wait_time",
-		.private = offsetof(struct cfq_group, stats.wait_time),
-		.seq_show = cfqg_print_rwstat,
-	},
-	{
-		.name = "io_merged",
-		.private = offsetof(struct cfq_group, stats.merged),
-		.seq_show = cfqg_print_rwstat,
-	},
-	{
-		.name = "io_queued",
-		.private = offsetof(struct cfq_group, stats.queued),
-		.seq_show = cfqg_print_rwstat,
-	},
-
-	/* the same statictics which cover the cfqg and its descendants */
-	{
-		.name = "time_recursive",
-		.private = offsetof(struct cfq_group, stats.time),
-		.seq_show = cfqg_print_stat_recursive,
-	},
-	{
-		.name = "sectors_recursive",
-		.seq_show = cfqg_print_stat_sectors_recursive,
-	},
-	{
-		.name = "io_service_bytes_recursive",
-		.private = (unsigned long)&blkcg_policy_cfq,
-		.seq_show = blkg_print_stat_bytes_recursive,
-	},
-	{
-		.name = "io_serviced_recursive",
-		.private = (unsigned long)&blkcg_policy_cfq,
-		.seq_show = blkg_print_stat_ios_recursive,
-	},
-	{
-		.name = "io_service_time_recursive",
-		.private = offsetof(struct cfq_group, stats.service_time),
-		.seq_show = cfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "io_wait_time_recursive",
-		.private = offsetof(struct cfq_group, stats.wait_time),
-		.seq_show = cfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "io_merged_recursive",
-		.private = offsetof(struct cfq_group, stats.merged),
-		.seq_show = cfqg_print_rwstat_recursive,
-	},
-	{
-		.name = "io_queued_recursive",
-		.private = offsetof(struct cfq_group, stats.queued),
-		.seq_show = cfqg_print_rwstat_recursive,
-	},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-	{
-		.name = "avg_queue_size",
-		.seq_show = cfqg_print_avg_queue_size,
-	},
-	{
-		.name = "group_wait_time",
-		.private = offsetof(struct cfq_group, stats.group_wait_time),
-		.seq_show = cfqg_print_stat,
-	},
-	{
-		.name = "idle_time",
-		.private = offsetof(struct cfq_group, stats.idle_time),
-		.seq_show = cfqg_print_stat,
-	},
-	{
-		.name = "empty_time",
-		.private = offsetof(struct cfq_group, stats.empty_time),
-		.seq_show = cfqg_print_stat,
-	},
-	{
-		.name = "dequeue",
-		.private = offsetof(struct cfq_group, stats.dequeue),
-		.seq_show = cfqg_print_stat,
-	},
-	{
-		.name = "unaccounted_time",
-		.private = offsetof(struct cfq_group, stats.unaccounted_time),
-		.seq_show = cfqg_print_stat,
-	},
-#endif	/* CONFIG_DEBUG_BLK_CGROUP */
-	{ }	/* terminate */
-};
-
-static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
-{
-	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
-	struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
-
-	seq_printf(sf, "default %u\n", cgd->weight);
-	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
-			  &blkcg_policy_cfq, 0, false);
-	return 0;
-}
-
-static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
-				     char *buf, size_t nbytes, loff_t off)
-{
-	char *endp;
-	int ret;
-	u64 v;
-
-	buf = strim(buf);
-
-	/* "WEIGHT" or "default WEIGHT" sets the default weight */
-	v = simple_strtoull(buf, &endp, 0);
-	if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
-		ret = __cfq_set_weight(of_css(of), v, true, false, false);
-		return ret ?: nbytes;
-	}
-
-	/* "MAJ:MIN WEIGHT" */
-	return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
-}
-
-static struct cftype cfq_blkcg_files[] = {
-	{
-		.name = "weight",
-		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cfq_print_weight_on_dfl,
-		.write = cfq_set_weight_on_dfl,
-	},
-	{ }	/* terminate */
-};
-
-#else /* GROUP_IOSCHED */
-static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
-					 struct blkcg *blkcg)
-{
-	return cfqd->root_group;
-}
-
-static inline void
-cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
-	cfqq->cfqg = cfqg;
-}
-
-#endif /* GROUP_IOSCHED */
-
-/*
- * The cfqd->service_trees holds all pending cfq_queue's that have
- * requests waiting to be processed. It is sorted in the order that
- * we will service the queues.
- */
-static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-				 bool add_front)
-{
-	struct rb_node **p, *parent;
-	struct cfq_queue *__cfqq;
-	u64 rb_key;
-	struct cfq_rb_root *st;
-	bool leftmost = true;
-	int new_cfqq = 1;
-	u64 now = ktime_get_ns();
-
-	st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq));
-	if (cfq_class_idle(cfqq)) {
-		rb_key = CFQ_IDLE_DELAY;
-		parent = st->rb_rightmost;
-		if (parent && parent != &cfqq->rb_node) {
-			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
-			rb_key += __cfqq->rb_key;
-		} else
-			rb_key += now;
-	} else if (!add_front) {
-		/*
-		 * Get our rb key offset. Subtract any residual slice
-		 * value carried from last service. A negative resid
-		 * count indicates slice overrun, and this should position
-		 * the next service time further away in the tree.
-		 */
-		rb_key = cfq_slice_offset(cfqd, cfqq) + now;
-		rb_key -= cfqq->slice_resid;
-		cfqq->slice_resid = 0;
-	} else {
-		rb_key = -NSEC_PER_SEC;
-		__cfqq = cfq_rb_first(st);
-		rb_key += __cfqq ? __cfqq->rb_key : now;
-	}
-
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
-		new_cfqq = 0;
-		/*
-		 * same position, nothing more to do
-		 */
-		if (rb_key == cfqq->rb_key && cfqq->service_tree == st)
-			return;
-
-		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
-		cfqq->service_tree = NULL;
-	}
-
-	parent = NULL;
-	cfqq->service_tree = st;
-	p = &st->rb.rb_root.rb_node;
-	while (*p) {
-		parent = *p;
-		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
-
-		/*
-		 * sort by key, that represents service time.
-		 */
-		if (rb_key < __cfqq->rb_key)
-			p = &parent->rb_left;
-		else {
-			p = &parent->rb_right;
-			leftmost = false;
-		}
-	}
-
-	cfqq->rb_key = rb_key;
-	rb_link_node(&cfqq->rb_node, parent, p);
-	rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost);
-	st->count++;
-	if (add_front || !new_cfqq)
-		return;
-	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
-}
-
-static struct cfq_queue *
-cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
-		     sector_t sector, struct rb_node **ret_parent,
-		     struct rb_node ***rb_link)
-{
-	struct rb_node **p, *parent;
-	struct cfq_queue *cfqq = NULL;
-
-	parent = NULL;
-	p = &root->rb_node;
-	while (*p) {
-		struct rb_node **n;
-
-		parent = *p;
-		cfqq = rb_entry(parent, struct cfq_queue, p_node);
-
-		/*
-		 * Sort strictly based on sector.  Smallest to the left,
-		 * largest to the right.
-		 */
-		if (sector > blk_rq_pos(cfqq->next_rq))
-			n = &(*p)->rb_right;
-		else if (sector < blk_rq_pos(cfqq->next_rq))
-			n = &(*p)->rb_left;
-		else
-			break;
-		p = n;
-		cfqq = NULL;
-	}
-
-	*ret_parent = parent;
-	if (rb_link)
-		*rb_link = p;
-	return cfqq;
-}
-
-static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	struct rb_node **p, *parent;
-	struct cfq_queue *__cfqq;
-
-	if (cfqq->p_root) {
-		rb_erase(&cfqq->p_node, cfqq->p_root);
-		cfqq->p_root = NULL;
-	}
-
-	if (cfq_class_idle(cfqq))
-		return;
-	if (!cfqq->next_rq)
-		return;
-
-	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
-	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
-				      blk_rq_pos(cfqq->next_rq), &parent, &p);
-	if (!__cfqq) {
-		rb_link_node(&cfqq->p_node, parent, p);
-		rb_insert_color(&cfqq->p_node, cfqq->p_root);
-	} else
-		cfqq->p_root = NULL;
-}
-
-/*
- * Update cfqq's position in the service tree.
- */
-static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	/*
-	 * Resorting requires the cfqq to be on the RR list already.
-	 */
-	if (cfq_cfqq_on_rr(cfqq)) {
-		cfq_service_tree_add(cfqd, cfqq, 0);
-		cfq_prio_tree_add(cfqd, cfqq);
-	}
-}
-
-/*
- * add to busy list of queues for service, trying to be fair in ordering
- * the pending list according to last request service
- */
-static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
-	BUG_ON(cfq_cfqq_on_rr(cfqq));
-	cfq_mark_cfqq_on_rr(cfqq);
-	cfqd->busy_queues++;
-	if (cfq_cfqq_sync(cfqq))
-		cfqd->busy_sync_queues++;
-
-	cfq_resort_rr_list(cfqd, cfqq);
-}
-
-/*
- * Called when the cfqq no longer has requests pending, remove it from
- * the service tree.
- */
-static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
-	BUG_ON(!cfq_cfqq_on_rr(cfqq));
-	cfq_clear_cfqq_on_rr(cfqq);
-
-	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
-		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
-		cfqq->service_tree = NULL;
-	}
-	if (cfqq->p_root) {
-		rb_erase(&cfqq->p_node, cfqq->p_root);
-		cfqq->p_root = NULL;
-	}
-
-	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
-	BUG_ON(!cfqd->busy_queues);
-	cfqd->busy_queues--;
-	if (cfq_cfqq_sync(cfqq))
-		cfqd->busy_sync_queues--;
-}
-
-/*
- * rb tree support functions
- */
-static void cfq_del_rq_rb(struct request *rq)
-{
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-	const int sync = rq_is_sync(rq);
-
-	BUG_ON(!cfqq->queued[sync]);
-	cfqq->queued[sync]--;
-
-	elv_rb_del(&cfqq->sort_list, rq);
-
-	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
-		/*
-		 * Queue will be deleted from service tree when we actually
-		 * expire it later. Right now just remove it from prio tree
-		 * as it is empty.
-		 */
-		if (cfqq->p_root) {
-			rb_erase(&cfqq->p_node, cfqq->p_root);
-			cfqq->p_root = NULL;
-		}
-	}
-}
-
-static void cfq_add_rq_rb(struct request *rq)
-{
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-	struct cfq_data *cfqd = cfqq->cfqd;
-	struct request *prev;
-
-	cfqq->queued[rq_is_sync(rq)]++;
-
-	elv_rb_add(&cfqq->sort_list, rq);
-
-	if (!cfq_cfqq_on_rr(cfqq))
-		cfq_add_cfqq_rr(cfqd, cfqq);
-
-	/*
-	 * check if this request is a better next-serve candidate
-	 */
-	prev = cfqq->next_rq;
-	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
-
-	/*
-	 * adjust priority tree position, if ->next_rq changes
-	 */
-	if (prev != cfqq->next_rq)
-		cfq_prio_tree_add(cfqd, cfqq);
-
-	BUG_ON(!cfqq->next_rq);
-}
-
-static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
-{
-	elv_rb_del(&cfqq->sort_list, rq);
-	cfqq->queued[rq_is_sync(rq)]--;
-	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
-	cfq_add_rq_rb(rq);
-	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
-				 rq->cmd_flags);
-}
-
-static struct request *
-cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
-{
-	struct task_struct *tsk = current;
-	struct cfq_io_cq *cic;
-	struct cfq_queue *cfqq;
-
-	cic = cfq_cic_lookup(cfqd, tsk->io_context);
-	if (!cic)
-		return NULL;
-
-	cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
-	if (cfqq)
-		return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
-
-	return NULL;
-}
-
-static void cfq_activate_request(struct request_queue *q, struct request *rq)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-
-	cfqd->rq_in_driver++;
-	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
-						cfqd->rq_in_driver);
-
-	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
-}
-
-static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-
-	WARN_ON(!cfqd->rq_in_driver);
-	cfqd->rq_in_driver--;
-	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
-						cfqd->rq_in_driver);
-}
-
-static void cfq_remove_request(struct request *rq)
-{
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-
-	if (cfqq->next_rq == rq)
-		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
-
-	list_del_init(&rq->queuelist);
-	cfq_del_rq_rb(rq);
-
-	cfqq->cfqd->rq_queued--;
-	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
-	if (rq->cmd_flags & REQ_PRIO) {
-		WARN_ON(!cfqq->prio_pending);
-		cfqq->prio_pending--;
-	}
-}
-
-static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
-		     struct bio *bio)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct request *__rq;
-
-	__rq = cfq_find_rq_fmerge(cfqd, bio);
-	if (__rq && elv_bio_merge_ok(__rq, bio)) {
-		*req = __rq;
-		return ELEVATOR_FRONT_MERGE;
-	}
-
-	return ELEVATOR_NO_MERGE;
-}
-
-static void cfq_merged_request(struct request_queue *q, struct request *req,
-			       enum elv_merge type)
-{
-	if (type == ELEVATOR_FRONT_MERGE) {
-		struct cfq_queue *cfqq = RQ_CFQQ(req);
-
-		cfq_reposition_rq_rb(cfqq, req);
-	}
-}
-
-static void cfq_bio_merged(struct request_queue *q, struct request *req,
-				struct bio *bio)
-{
-	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf);
-}
-
-static void
-cfq_merged_requests(struct request_queue *q, struct request *rq,
-		    struct request *next)
-{
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-
-	/*
-	 * reposition in fifo if next is older than rq
-	 */
-	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
-	    next->fifo_time < rq->fifo_time &&
-	    cfqq == RQ_CFQQ(next)) {
-		list_move(&rq->queuelist, &next->queuelist);
-		rq->fifo_time = next->fifo_time;
-	}
-
-	if (cfqq->next_rq == next)
-		cfqq->next_rq = rq;
-	cfq_remove_request(next);
-	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
-
-	cfqq = RQ_CFQQ(next);
-	/*
-	 * all requests of this queue are merged to other queues, delete it
-	 * from the service tree. If it's the active_queue,
-	 * cfq_dispatch_requests() will choose to expire it or do idle
-	 */
-	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
-	    cfqq != cfqd->active_queue)
-		cfq_del_cfqq_rr(cfqd, cfqq);
-}
-
-static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq,
-			       struct bio *bio)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	bool is_sync = op_is_sync(bio->bi_opf);
-	struct cfq_io_cq *cic;
-	struct cfq_queue *cfqq;
-
-	/*
-	 * Disallow merge of a sync bio into an async request.
-	 */
-	if (is_sync && !rq_is_sync(rq))
-		return false;
-
-	/*
-	 * Lookup the cfqq that this bio will be queued with and allow
-	 * merge only if rq is queued there.
-	 */
-	cic = cfq_cic_lookup(cfqd, current->io_context);
-	if (!cic)
-		return false;
-
-	cfqq = cic_to_cfqq(cic, is_sync);
-	return cfqq == RQ_CFQQ(rq);
-}
-
-static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq,
-			      struct request *next)
-{
-	return RQ_CFQQ(rq) == RQ_CFQQ(next);
-}
-
-static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	hrtimer_try_to_cancel(&cfqd->idle_slice_timer);
-	cfqg_stats_update_idle_time(cfqq->cfqg);
-}
-
-static void __cfq_set_active_queue(struct cfq_data *cfqd,
-				   struct cfq_queue *cfqq)
-{
-	if (cfqq) {
-		cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d",
-				cfqd->serving_wl_class, cfqd->serving_wl_type);
-		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
-		cfqq->slice_start = 0;
-		cfqq->dispatch_start = ktime_get_ns();
-		cfqq->allocated_slice = 0;
-		cfqq->slice_end = 0;
-		cfqq->slice_dispatch = 0;
-		cfqq->nr_sectors = 0;
-
-		cfq_clear_cfqq_wait_request(cfqq);
-		cfq_clear_cfqq_must_dispatch(cfqq);
-		cfq_clear_cfqq_must_alloc_slice(cfqq);
-		cfq_clear_cfqq_fifo_expire(cfqq);
-		cfq_mark_cfqq_slice_new(cfqq);
-
-		cfq_del_timer(cfqd, cfqq);
-	}
-
-	cfqd->active_queue = cfqq;
-}
-
-/*
- * current cfqq expired its slice (or was too idle), select new one
- */
-static void
-__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		    bool timed_out)
-{
-	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
-
-	if (cfq_cfqq_wait_request(cfqq))
-		cfq_del_timer(cfqd, cfqq);
-
-	cfq_clear_cfqq_wait_request(cfqq);
-	cfq_clear_cfqq_wait_busy(cfqq);
-
-	/*
-	 * If this cfqq is shared between multiple processes, check to
-	 * make sure that those processes are still issuing I/Os within
-	 * the mean seek distance.  If not, it may be time to break the
-	 * queues apart again.
-	 */
-	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
-		cfq_mark_cfqq_split_coop(cfqq);
-
-	/*
-	 * store what was left of this slice, if the queue idled/timed out
-	 */
-	if (timed_out) {
-		if (cfq_cfqq_slice_new(cfqq))
-			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
-		else
-			cfqq->slice_resid = cfqq->slice_end - ktime_get_ns();
-		cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid);
-	}
-
-	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
-
-	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
-		cfq_del_cfqq_rr(cfqd, cfqq);
-
-	cfq_resort_rr_list(cfqd, cfqq);
-
-	if (cfqq == cfqd->active_queue)
-		cfqd->active_queue = NULL;
-
-	if (cfqd->active_cic) {
-		put_io_context(cfqd->active_cic->icq.ioc);
-		cfqd->active_cic = NULL;
-	}
-}
-
-static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
-{
-	struct cfq_queue *cfqq = cfqd->active_queue;
-
-	if (cfqq)
-		__cfq_slice_expired(cfqd, cfqq, timed_out);
-}
-
-/*
- * Get next queue for service. Unless we have a queue preemption,
- * we'll simply select the first cfqq in the service tree.
- */
-static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
-{
-	struct cfq_rb_root *st = st_for(cfqd->serving_group,
-			cfqd->serving_wl_class, cfqd->serving_wl_type);
-
-	if (!cfqd->rq_queued)
-		return NULL;
-
-	/* There is nothing to dispatch */
-	if (!st)
-		return NULL;
-	if (RB_EMPTY_ROOT(&st->rb.rb_root))
-		return NULL;
-	return cfq_rb_first(st);
-}
-
-static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
-{
-	struct cfq_group *cfqg;
-	struct cfq_queue *cfqq;
-	int i, j;
-	struct cfq_rb_root *st;
-
-	if (!cfqd->rq_queued)
-		return NULL;
-
-	cfqg = cfq_get_next_cfqg(cfqd);
-	if (!cfqg)
-		return NULL;
-
-	for_each_cfqg_st(cfqg, i, j, st) {
-		cfqq = cfq_rb_first(st);
-		if (cfqq)
-			return cfqq;
-	}
-	return NULL;
-}
-
-/*
- * Get and set a new active queue for service.
- */
-static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
-					      struct cfq_queue *cfqq)
-{
-	if (!cfqq)
-		cfqq = cfq_get_next_queue(cfqd);
-
-	__cfq_set_active_queue(cfqd, cfqq);
-	return cfqq;
-}
-
-static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
-					  struct request *rq)
-{
-	if (blk_rq_pos(rq) >= cfqd->last_position)
-		return blk_rq_pos(rq) - cfqd->last_position;
-	else
-		return cfqd->last_position - blk_rq_pos(rq);
-}
-
-static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-			       struct request *rq)
-{
-	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
-}
-
-static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
-				    struct cfq_queue *cur_cfqq)
-{
-	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
-	struct rb_node *parent, *node;
-	struct cfq_queue *__cfqq;
-	sector_t sector = cfqd->last_position;
-
-	if (RB_EMPTY_ROOT(root))
-		return NULL;
-
-	/*
-	 * First, if we find a request starting at the end of the last
-	 * request, choose it.
-	 */
-	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
-	if (__cfqq)
-		return __cfqq;
-
-	/*
-	 * If the exact sector wasn't found, the parent of the NULL leaf
-	 * will contain the closest sector.
-	 */
-	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
-	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
-		return __cfqq;
-
-	if (blk_rq_pos(__cfqq->next_rq) < sector)
-		node = rb_next(&__cfqq->p_node);
-	else
-		node = rb_prev(&__cfqq->p_node);
-	if (!node)
-		return NULL;
-
-	__cfqq = rb_entry(node, struct cfq_queue, p_node);
-	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
-		return __cfqq;
-
-	return NULL;
-}
-
-/*
- * cfqd - obvious
- * cur_cfqq - passed in so that we don't decide that the current queue is
- * 	      closely cooperating with itself.
- *
- * So, basically we're assuming that that cur_cfqq has dispatched at least
- * one request, and that cfqd->last_position reflects a position on the disk
- * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
- * assumption.
- */
-static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
-					      struct cfq_queue *cur_cfqq)
-{
-	struct cfq_queue *cfqq;
-
-	if (cfq_class_idle(cur_cfqq))
-		return NULL;
-	if (!cfq_cfqq_sync(cur_cfqq))
-		return NULL;
-	if (CFQQ_SEEKY(cur_cfqq))
-		return NULL;
-
-	/*
-	 * Don't search priority tree if it's the only queue in the group.
-	 */
-	if (cur_cfqq->cfqg->nr_cfqq == 1)
-		return NULL;
-
-	/*
-	 * We should notice if some of the queues are cooperating, eg
-	 * working closely on the same area of the disk. In that case,
-	 * we can group them together and don't waste time idling.
-	 */
-	cfqq = cfqq_close(cfqd, cur_cfqq);
-	if (!cfqq)
-		return NULL;
-
-	/* If new queue belongs to different cfq_group, don't choose it */
-	if (cur_cfqq->cfqg != cfqq->cfqg)
-		return NULL;
-
-	/*
-	 * It only makes sense to merge sync queues.
-	 */
-	if (!cfq_cfqq_sync(cfqq))
-		return NULL;
-	if (CFQQ_SEEKY(cfqq))
-		return NULL;
-
-	/*
-	 * Do not merge queues of different priority classes
-	 */
-	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
-		return NULL;
-
-	return cfqq;
-}
-
-/*
- * Determine whether we should enforce idle window for this queue.
- */
-
-static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	enum wl_class_t wl_class = cfqq_class(cfqq);
-	struct cfq_rb_root *st = cfqq->service_tree;
-
-	BUG_ON(!st);
-	BUG_ON(!st->count);
-
-	if (!cfqd->cfq_slice_idle)
-		return false;
-
-	/* We never do for idle class queues. */
-	if (wl_class == IDLE_WORKLOAD)
-		return false;
-
-	/* We do for queues that were marked with idle window flag. */
-	if (cfq_cfqq_idle_window(cfqq) &&
-	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
-		return true;
-
-	/*
-	 * Otherwise, we do only if they are the last ones
-	 * in their service tree.
-	 */
-	if (st->count == 1 && cfq_cfqq_sync(cfqq) &&
-	   !cfq_io_thinktime_big(cfqd, &st->ttime, false))
-		return true;
-	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);
-	return false;
-}
-
-static void cfq_arm_slice_timer(struct cfq_data *cfqd)
-{
-	struct cfq_queue *cfqq = cfqd->active_queue;
-	struct cfq_rb_root *st = cfqq->service_tree;
-	struct cfq_io_cq *cic;
-	u64 sl, group_idle = 0;
-	u64 now = ktime_get_ns();
-
-	/*
-	 * SSD device without seek penalty, disable idling. But only do so
-	 * for devices that support queuing, otherwise we still have a problem
-	 * with sync vs async workloads.
-	 */
-	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag &&
-		!cfqd->cfq_group_idle)
-		return;
-
-	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
-	WARN_ON(cfq_cfqq_slice_new(cfqq));
-
-	/*
-	 * idle is disabled, either manually or by past process history
-	 */
-	if (!cfq_should_idle(cfqd, cfqq)) {
-		/* no queue idling. Check for group idling */
-		if (cfqd->cfq_group_idle)
-			group_idle = cfqd->cfq_group_idle;
-		else
-			return;
-	}
-
-	/*
-	 * still active requests from this queue, don't idle
-	 */
-	if (cfqq->dispatched)
-		return;
-
-	/*
-	 * task has exited, don't wait
-	 */
-	cic = cfqd->active_cic;
-	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
-		return;
-
-	/*
-	 * If our average think time is larger than the remaining time
-	 * slice, then don't idle. This avoids overrunning the allotted
-	 * time slice.
-	 */
-	if (sample_valid(cic->ttime.ttime_samples) &&
-	    (cfqq->slice_end - now < cic->ttime.ttime_mean)) {
-		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu",
-			     cic->ttime.ttime_mean);
-		return;
-	}
-
-	/*
-	 * There are other queues in the group or this is the only group and
-	 * it has too big thinktime, don't do group idle.
-	 */
-	if (group_idle &&
-	    (cfqq->cfqg->nr_cfqq > 1 ||
-	     cfq_io_thinktime_big(cfqd, &st->ttime, true)))
-		return;
-
-	cfq_mark_cfqq_wait_request(cfqq);
-
-	if (group_idle)
-		sl = cfqd->cfq_group_idle;
-	else
-		sl = cfqd->cfq_slice_idle;
-
-	hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl),
-		      HRTIMER_MODE_REL);
-	cfqg_stats_set_start_idle_time(cfqq->cfqg);
-	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl,
-			group_idle ? 1 : 0);
-}
-
-/*
- * Move request from internal lists to the request queue dispatch list.
- */
-static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-
-	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
-
-	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
-	cfq_remove_request(rq);
-	cfqq->dispatched++;
-	(RQ_CFQG(rq))->dispatched++;
-	elv_dispatch_sort(q, rq);
-
-	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
-	cfqq->nr_sectors += blk_rq_sectors(rq);
-}
-
-/*
- * return expired entry, or NULL to just start from scratch in rbtree
- */
-static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
-{
-	struct request *rq = NULL;
-
-	if (cfq_cfqq_fifo_expire(cfqq))
-		return NULL;
-
-	cfq_mark_cfqq_fifo_expire(cfqq);
-
-	if (list_empty(&cfqq->fifo))
-		return NULL;
-
-	rq = rq_entry_fifo(cfqq->fifo.next);
-	if (ktime_get_ns() < rq->fifo_time)
-		rq = NULL;
-
-	return rq;
-}
-
-static inline int
-cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	const int base_rq = cfqd->cfq_slice_async_rq;
-
-	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
-
-	return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);
-}
-
-/*
- * Must be called with the queue_lock held.
- */
-static int cfqq_process_refs(struct cfq_queue *cfqq)
-{
-	int process_refs, io_refs;
-
-	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
-	process_refs = cfqq->ref - io_refs;
-	BUG_ON(process_refs < 0);
-	return process_refs;
-}
-
-static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
-{
-	int process_refs, new_process_refs;
-	struct cfq_queue *__cfqq;
-
-	/*
-	 * If there are no process references on the new_cfqq, then it is
-	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
-	 * chain may have dropped their last reference (not just their
-	 * last process reference).
-	 */
-	if (!cfqq_process_refs(new_cfqq))
-		return;
-
-	/* Avoid a circular list and skip interim queue merges */
-	while ((__cfqq = new_cfqq->new_cfqq)) {
-		if (__cfqq == cfqq)
-			return;
-		new_cfqq = __cfqq;
-	}
-
-	process_refs = cfqq_process_refs(cfqq);
-	new_process_refs = cfqq_process_refs(new_cfqq);
-	/*
-	 * If the process for the cfqq has gone away, there is no
-	 * sense in merging the queues.
-	 */
-	if (process_refs == 0 || new_process_refs == 0)
-		return;
-
-	/*
-	 * Merge in the direction of the lesser amount of work.
-	 */
-	if (new_process_refs >= process_refs) {
-		cfqq->new_cfqq = new_cfqq;
-		new_cfqq->ref += process_refs;
-	} else {
-		new_cfqq->new_cfqq = cfqq;
-		cfqq->ref += new_process_refs;
-	}
-}
-
-static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd,
-			struct cfq_group *cfqg, enum wl_class_t wl_class)
-{
-	struct cfq_queue *queue;
-	int i;
-	bool key_valid = false;
-	u64 lowest_key = 0;
-	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
-
-	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
-		/* select the one with lowest rb_key */
-		queue = cfq_rb_first(st_for(cfqg, wl_class, i));
-		if (queue &&
-		    (!key_valid || queue->rb_key < lowest_key)) {
-			lowest_key = queue->rb_key;
-			cur_best = i;
-			key_valid = true;
-		}
-	}
-
-	return cur_best;
-}
-
-static void
-choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg)
-{
-	u64 slice;
-	unsigned count;
-	struct cfq_rb_root *st;
-	u64 group_slice;
-	enum wl_class_t original_class = cfqd->serving_wl_class;
-	u64 now = ktime_get_ns();
-
-	/* Choose next priority. RT > BE > IDLE */
-	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_wl_class = RT_WORKLOAD;
-	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
-		cfqd->serving_wl_class = BE_WORKLOAD;
-	else {
-		cfqd->serving_wl_class = IDLE_WORKLOAD;
-		cfqd->workload_expires = now + jiffies_to_nsecs(1);
-		return;
-	}
-
-	if (original_class != cfqd->serving_wl_class)
-		goto new_workload;
-
-	/*
-	 * For RT and BE, we have to choose also the type
-	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
-	 * expiration time
-	 */
-	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
-	count = st->count;
-
-	/*
-	 * check workload expiration, and that we still have other queues ready
-	 */
-	if (count && !(now > cfqd->workload_expires))
-		return;
-
-new_workload:
-	/* otherwise select new workload type */
-	cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg,
-					cfqd->serving_wl_class);
-	st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type);
-	count = st->count;
-
-	/*
-	 * the workload slice is computed as a fraction of target latency
-	 * proportional to the number of queues in that workload, over
-	 * all the queues in the same priority class
-	 */
-	group_slice = cfq_group_slice(cfqd, cfqg);
-
-	slice = div_u64(group_slice * count,
-		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class],
-		      cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd,
-					cfqg)));
-
-	if (cfqd->serving_wl_type == ASYNC_WORKLOAD) {
-		u64 tmp;
-
-		/*
-		 * Async queues are currently system wide. Just taking
-		 * proportion of queues with-in same group will lead to higher
-		 * async ratio system wide as generally root group is going
-		 * to have higher weight. A more accurate thing would be to
-		 * calculate system wide asnc/sync ratio.
-		 */
-		tmp = cfqd->cfq_target_latency *
-			cfqg_busy_async_queues(cfqd, cfqg);
-		tmp = div_u64(tmp, cfqd->busy_queues);
-		slice = min_t(u64, slice, tmp);
-
-		/* async workload slice is scaled down according to
-		 * the sync/async slice ratio. */
-		slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]);
-	} else
-		/* sync workload slice is at least 2 * cfq_slice_idle */
-		slice = max(slice, 2 * cfqd->cfq_slice_idle);
-
-	slice = max_t(u64, slice, CFQ_MIN_TT);
-	cfq_log(cfqd, "workload slice:%llu", slice);
-	cfqd->workload_expires = now + slice;
-}
-
-static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
-{
-	struct cfq_rb_root *st = &cfqd->grp_service_tree;
-	struct cfq_group *cfqg;
-
-	if (RB_EMPTY_ROOT(&st->rb.rb_root))
-		return NULL;
-	cfqg = cfq_rb_first_group(st);
-	update_min_vdisktime(st);
-	return cfqg;
-}
-
-static void cfq_choose_cfqg(struct cfq_data *cfqd)
-{
-	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
-	u64 now = ktime_get_ns();
-
-	cfqd->serving_group = cfqg;
-
-	/* Restore the workload type data */
-	if (cfqg->saved_wl_slice) {
-		cfqd->workload_expires = now + cfqg->saved_wl_slice;
-		cfqd->serving_wl_type = cfqg->saved_wl_type;
-		cfqd->serving_wl_class = cfqg->saved_wl_class;
-	} else
-		cfqd->workload_expires = now - 1;
-
-	choose_wl_class_and_type(cfqd, cfqg);
-}
-
-/*
- * Select a queue for service. If we have a current active queue,
- * check whether to continue servicing it, or retrieve and set a new one.
- */
-static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
-{
-	struct cfq_queue *cfqq, *new_cfqq = NULL;
-	u64 now = ktime_get_ns();
-
-	cfqq = cfqd->active_queue;
-	if (!cfqq)
-		goto new_queue;
-
-	if (!cfqd->rq_queued)
-		return NULL;
-
-	/*
-	 * We were waiting for group to get backlogged. Expire the queue
-	 */
-	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
-		goto expire;
-
-	/*
-	 * The active queue has run out of time, expire it and select new.
-	 */
-	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
-		/*
-		 * If slice had not expired at the completion of last request
-		 * we might not have turned on wait_busy flag. Don't expire
-		 * the queue yet. Allow the group to get backlogged.
-		 *
-		 * The very fact that we have used the slice, that means we
-		 * have been idling all along on this queue and it should be
-		 * ok to wait for this request to complete.
-		 */
-		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
-		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
-			cfqq = NULL;
-			goto keep_queue;
-		} else
-			goto check_group_idle;
-	}
-
-	/*
-	 * The active queue has requests and isn't expired, allow it to
-	 * dispatch.
-	 */
-	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
-		goto keep_queue;
-
-	/*
-	 * If another queue has a request waiting within our mean seek
-	 * distance, let it run.  The expire code will check for close
-	 * cooperators and put the close queue at the front of the service
-	 * tree.  If possible, merge the expiring queue with the new cfqq.
-	 */
-	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
-	if (new_cfqq) {
-		if (!cfqq->new_cfqq)
-			cfq_setup_merge(cfqq, new_cfqq);
-		goto expire;
-	}
-
-	/*
-	 * No requests pending. If the active queue still has requests in
-	 * flight or is idling for a new request, allow either of these
-	 * conditions to happen (or time out) before selecting a new queue.
-	 */
-	if (hrtimer_active(&cfqd->idle_slice_timer)) {
-		cfqq = NULL;
-		goto keep_queue;
-	}
-
-	/*
-	 * This is a deep seek queue, but the device is much faster than
-	 * the queue can deliver, don't idle
-	 **/
-	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
-	    (cfq_cfqq_slice_new(cfqq) ||
-	    (cfqq->slice_end - now > now - cfqq->slice_start))) {
-		cfq_clear_cfqq_deep(cfqq);
-		cfq_clear_cfqq_idle_window(cfqq);
-	}
-
-	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
-		cfqq = NULL;
-		goto keep_queue;
-	}
-
-	/*
-	 * If group idle is enabled and there are requests dispatched from
-	 * this group, wait for requests to complete.
-	 */
-check_group_idle:
-	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
-	    cfqq->cfqg->dispatched &&
-	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
-		cfqq = NULL;
-		goto keep_queue;
-	}
-
-expire:
-	cfq_slice_expired(cfqd, 0);
-new_queue:
-	/*
-	 * Current queue expired. Check if we have to switch to a new
-	 * service tree
-	 */
-	if (!new_cfqq)
-		cfq_choose_cfqg(cfqd);
-
-	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
-keep_queue:
-	return cfqq;
-}
-
-static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
-{
-	int dispatched = 0;
-
-	while (cfqq->next_rq) {
-		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
-		dispatched++;
-	}
-
-	BUG_ON(!list_empty(&cfqq->fifo));
-
-	/* By default cfqq is not expired if it is empty. Do it explicitly */
-	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
-	return dispatched;
-}
-
-/*
- * Drain our current requests. Used for barriers and when switching
- * io schedulers on-the-fly.
- */
-static int cfq_forced_dispatch(struct cfq_data *cfqd)
-{
-	struct cfq_queue *cfqq;
-	int dispatched = 0;
-
-	/* Expire the timeslice of the current active queue first */
-	cfq_slice_expired(cfqd, 0);
-	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
-		__cfq_set_active_queue(cfqd, cfqq);
-		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
-	}
-
-	BUG_ON(cfqd->busy_queues);
-
-	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
-	return dispatched;
-}
-
-static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
-	struct cfq_queue *cfqq)
-{
-	u64 now = ktime_get_ns();
-
-	/* the queue hasn't finished any request, can't estimate */
-	if (cfq_cfqq_slice_new(cfqq))
-		return true;
-	if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end)
-		return true;
-
-	return false;
-}
-
-static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	unsigned int max_dispatch;
-
-	if (cfq_cfqq_must_dispatch(cfqq))
-		return true;
-
-	/*
-	 * Drain async requests before we start sync IO
-	 */
-	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
-		return false;
-
-	/*
-	 * If this is an async queue and we have sync IO in flight, let it wait
-	 */
-	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
-		return false;
-
-	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
-	if (cfq_class_idle(cfqq))
-		max_dispatch = 1;
-
-	/*
-	 * Does this cfqq already have too much IO in flight?
-	 */
-	if (cfqq->dispatched >= max_dispatch) {
-		bool promote_sync = false;
-		/*
-		 * idle queue must always only have a single IO in flight
-		 */
-		if (cfq_class_idle(cfqq))
-			return false;
-
-		/*
-		 * If there is only one sync queue
-		 * we can ignore async queue here and give the sync
-		 * queue no dispatch limit. The reason is a sync queue can
-		 * preempt async queue, limiting the sync queue doesn't make
-		 * sense. This is useful for aiostress test.
-		 */
-		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
-			promote_sync = true;
-
-		/*
-		 * We have other queues, don't allow more IO from this one
-		 */
-		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
-				!promote_sync)
-			return false;
-
-		/*
-		 * Sole queue user, no limit
-		 */
-		if (cfqd->busy_queues == 1 || promote_sync)
-			max_dispatch = -1;
-		else
-			/*
-			 * Normally we start throttling cfqq when cfq_quantum/2
-			 * requests have been dispatched. But we can drive
-			 * deeper queue depths at the beginning of slice
-			 * subjected to upper limit of cfq_quantum.
-			 * */
-			max_dispatch = cfqd->cfq_quantum;
-	}
-
-	/*
-	 * Async queues must wait a bit before being allowed dispatch.
-	 * We also ramp up the dispatch depth gradually for async IO,
-	 * based on the last sync IO we serviced
-	 */
-	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
-		u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync;
-		unsigned int depth;
-
-		depth = div64_u64(last_sync, cfqd->cfq_slice[1]);
-		if (!depth && !cfqq->dispatched)
-			depth = 1;
-		if (depth < max_dispatch)
-			max_dispatch = depth;
-	}
-
-	/*
-	 * If we're below the current max, allow a dispatch
-	 */
-	return cfqq->dispatched < max_dispatch;
-}
-
-/*
- * Dispatch a request from cfqq, moving them to the request queue
- * dispatch list.
- */
-static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	struct request *rq;
-
-	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
-
-	rq = cfq_check_fifo(cfqq);
-	if (rq)
-		cfq_mark_cfqq_must_dispatch(cfqq);
-
-	if (!cfq_may_dispatch(cfqd, cfqq))
-		return false;
-
-	/*
-	 * follow expired path, else get first next available
-	 */
-	if (!rq)
-		rq = cfqq->next_rq;
-	else
-		cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
-
-	/*
-	 * insert request into driver dispatch list
-	 */
-	cfq_dispatch_insert(cfqd->queue, rq);
-
-	if (!cfqd->active_cic) {
-		struct cfq_io_cq *cic = RQ_CIC(rq);
-
-		atomic_long_inc(&cic->icq.ioc->refcount);
-		cfqd->active_cic = cic;
-	}
-
-	return true;
-}
-
-/*
- * Find the cfqq that we need to service and move a request from that to the
- * dispatch list
- */
-static int cfq_dispatch_requests(struct request_queue *q, int force)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq;
-
-	if (!cfqd->busy_queues)
-		return 0;
-
-	if (unlikely(force))
-		return cfq_forced_dispatch(cfqd);
-
-	cfqq = cfq_select_queue(cfqd);
-	if (!cfqq)
-		return 0;
-
-	/*
-	 * Dispatch a request from this cfqq, if it is allowed
-	 */
-	if (!cfq_dispatch_request(cfqd, cfqq))
-		return 0;
-
-	cfqq->slice_dispatch++;
-	cfq_clear_cfqq_must_dispatch(cfqq);
-
-	/*
-	 * expire an async queue immediately if it has used up its slice. idle
-	 * queue always expire after 1 dispatch round.
-	 */
-	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
-	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
-	    cfq_class_idle(cfqq))) {
-		cfqq->slice_end = ktime_get_ns() + 1;
-		cfq_slice_expired(cfqd, 0);
-	}
-
-	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
-	return 1;
-}
-
-/*
- * task holds one reference to the queue, dropped when task exits. each rq
- * in-flight on this queue also holds a reference, dropped when rq is freed.
- *
- * Each cfq queue took a reference on the parent group. Drop it now.
- * queue lock must be held here.
- */
-static void cfq_put_queue(struct cfq_queue *cfqq)
-{
-	struct cfq_data *cfqd = cfqq->cfqd;
-	struct cfq_group *cfqg;
-
-	BUG_ON(cfqq->ref <= 0);
-
-	cfqq->ref--;
-	if (cfqq->ref)
-		return;
-
-	cfq_log_cfqq(cfqd, cfqq, "put_queue");
-	BUG_ON(rb_first(&cfqq->sort_list));
-	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
-	cfqg = cfqq->cfqg;
-
-	if (unlikely(cfqd->active_queue == cfqq)) {
-		__cfq_slice_expired(cfqd, cfqq, 0);
-		cfq_schedule_dispatch(cfqd);
-	}
-
-	BUG_ON(cfq_cfqq_on_rr(cfqq));
-	kmem_cache_free(cfq_pool, cfqq);
-	cfqg_put(cfqg);
-}
-
-static void cfq_put_cooperator(struct cfq_queue *cfqq)
-{
-	struct cfq_queue *__cfqq, *next;
-
-	/*
-	 * If this queue was scheduled to merge with another queue, be
-	 * sure to drop the reference taken on that queue (and others in
-	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
-	 */
-	__cfqq = cfqq->new_cfqq;
-	while (__cfqq) {
-		if (__cfqq == cfqq) {
-			WARN(1, "cfqq->new_cfqq loop detected\n");
-			break;
-		}
-		next = __cfqq->new_cfqq;
-		cfq_put_queue(__cfqq);
-		__cfqq = next;
-	}
-}
-
-static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	if (unlikely(cfqq == cfqd->active_queue)) {
-		__cfq_slice_expired(cfqd, cfqq, 0);
-		cfq_schedule_dispatch(cfqd);
-	}
-
-	cfq_put_cooperator(cfqq);
-
-	cfq_put_queue(cfqq);
-}
-
-static void cfq_init_icq(struct io_cq *icq)
-{
-	struct cfq_io_cq *cic = icq_to_cic(icq);
-
-	cic->ttime.last_end_request = ktime_get_ns();
-}
-
-static void cfq_exit_icq(struct io_cq *icq)
-{
-	struct cfq_io_cq *cic = icq_to_cic(icq);
-	struct cfq_data *cfqd = cic_to_cfqd(cic);
-
-	if (cic_to_cfqq(cic, false)) {
-		cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false));
-		cic_set_cfqq(cic, NULL, false);
-	}
-
-	if (cic_to_cfqq(cic, true)) {
-		cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true));
-		cic_set_cfqq(cic, NULL, true);
-	}
-}
-
-static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
-{
-	struct task_struct *tsk = current;
-	int ioprio_class;
-
-	if (!cfq_cfqq_prio_changed(cfqq))
-		return;
-
-	ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
-	switch (ioprio_class) {
-	default:
-		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
-		/* fall through */
-	case IOPRIO_CLASS_NONE:
-		/*
-		 * no prio set, inherit CPU scheduling settings
-		 */
-		cfqq->ioprio = task_nice_ioprio(tsk);
-		cfqq->ioprio_class = task_nice_ioclass(tsk);
-		break;
-	case IOPRIO_CLASS_RT:
-		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
-		cfqq->ioprio_class = IOPRIO_CLASS_RT;
-		break;
-	case IOPRIO_CLASS_BE:
-		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
-		cfqq->ioprio_class = IOPRIO_CLASS_BE;
-		break;
-	case IOPRIO_CLASS_IDLE:
-		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
-		cfqq->ioprio = 7;
-		cfq_clear_cfqq_idle_window(cfqq);
-		break;
-	}
-
-	/*
-	 * keep track of original prio settings in case we have to temporarily
-	 * elevate the priority of this queue
-	 */
-	cfqq->org_ioprio = cfqq->ioprio;
-	cfqq->org_ioprio_class = cfqq->ioprio_class;
-	cfq_clear_cfqq_prio_changed(cfqq);
-}
-
-static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
-{
-	int ioprio = cic->icq.ioc->ioprio;
-	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	struct cfq_queue *cfqq;
-
-	/*
-	 * Check whether ioprio has changed.  The condition may trigger
-	 * spuriously on a newly created cic but there's no harm.
-	 */
-	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
-		return;
-
-	cfqq = cic_to_cfqq(cic, false);
-	if (cfqq) {
-		cfq_put_queue(cfqq);
-		cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
-		cic_set_cfqq(cic, cfqq, false);
-	}
-
-	cfqq = cic_to_cfqq(cic, true);
-	if (cfqq)
-		cfq_mark_cfqq_prio_changed(cfqq);
-
-	cic->ioprio = ioprio;
-}
-
-static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-			  pid_t pid, bool is_sync)
-{
-	RB_CLEAR_NODE(&cfqq->rb_node);
-	RB_CLEAR_NODE(&cfqq->p_node);
-	INIT_LIST_HEAD(&cfqq->fifo);
-
-	cfqq->ref = 0;
-	cfqq->cfqd = cfqd;
-
-	cfq_mark_cfqq_prio_changed(cfqq);
-
-	if (is_sync) {
-		if (!cfq_class_idle(cfqq))
-			cfq_mark_cfqq_idle_window(cfqq);
-		cfq_mark_cfqq_sync(cfqq);
-	}
-	cfqq->pid = pid;
-}
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
-{
-	struct cfq_data *cfqd = cic_to_cfqd(cic);
-	struct cfq_queue *cfqq;
-	uint64_t serial_nr;
-
-	rcu_read_lock();
-	serial_nr = bio_blkcg(bio)->css.serial_nr;
-	rcu_read_unlock();
-
-	/*
-	 * Check whether blkcg has changed.  The condition may trigger
-	 * spuriously on a newly created cic but there's no harm.
-	 */
-	if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr))
-		return;
-
-	/*
-	 * Drop reference to queues.  New queues will be assigned in new
-	 * group upon arrival of fresh requests.
-	 */
-	cfqq = cic_to_cfqq(cic, false);
-	if (cfqq) {
-		cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
-		cic_set_cfqq(cic, NULL, false);
-		cfq_put_queue(cfqq);
-	}
-
-	cfqq = cic_to_cfqq(cic, true);
-	if (cfqq) {
-		cfq_log_cfqq(cfqd, cfqq, "changed cgroup");
-		cic_set_cfqq(cic, NULL, true);
-		cfq_put_queue(cfqq);
-	}
-
-	cic->blkcg_serial_nr = serial_nr;
-}
-#else
-static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
-{
-}
-#endif  /* CONFIG_CFQ_GROUP_IOSCHED */
-
-static struct cfq_queue **
-cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio)
-{
-	switch (ioprio_class) {
-	case IOPRIO_CLASS_RT:
-		return &cfqg->async_cfqq[0][ioprio];
-	case IOPRIO_CLASS_NONE:
-		ioprio = IOPRIO_NORM;
-		/* fall through */
-	case IOPRIO_CLASS_BE:
-		return &cfqg->async_cfqq[1][ioprio];
-	case IOPRIO_CLASS_IDLE:
-		return &cfqg->async_idle_cfqq;
-	default:
-		BUG();
-	}
-}
-
-static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-	      struct bio *bio)
-{
-	int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
-	int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
-	struct cfq_queue **async_cfqq = NULL;
-	struct cfq_queue *cfqq;
-	struct cfq_group *cfqg;
-
-	rcu_read_lock();
-	cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio));
-	if (!cfqg) {
-		cfqq = &cfqd->oom_cfqq;
-		goto out;
-	}
-
-	if (!is_sync) {
-		if (!ioprio_valid(cic->ioprio)) {
-			struct task_struct *tsk = current;
-			ioprio = task_nice_ioprio(tsk);
-			ioprio_class = task_nice_ioclass(tsk);
-		}
-		async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio);
-		cfqq = *async_cfqq;
-		if (cfqq)
-			goto out;
-	}
-
-	cfqq = kmem_cache_alloc_node(cfq_pool,
-				     GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
-				     cfqd->queue->node);
-	if (!cfqq) {
-		cfqq = &cfqd->oom_cfqq;
-		goto out;
-	}
-
-	/* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */
-	cfqq->ioprio_class = IOPRIO_CLASS_NONE;
-	cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
-	cfq_init_prio_data(cfqq, cic);
-	cfq_link_cfqq_cfqg(cfqq, cfqg);
-	cfq_log_cfqq(cfqd, cfqq, "alloced");
-
-	if (async_cfqq) {
-		/* a new async queue is created, pin and remember */
-		cfqq->ref++;
-		*async_cfqq = cfqq;
-	}
-out:
-	cfqq->ref++;
-	rcu_read_unlock();
-	return cfqq;
-}
-
-static void
-__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle)
-{
-	u64 elapsed = ktime_get_ns() - ttime->last_end_request;
-	elapsed = min(elapsed, 2UL * slice_idle);
-
-	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
-	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
-	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
-				     ttime->ttime_samples);
-}
-
-static void
-cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-			struct cfq_io_cq *cic)
-{
-	if (cfq_cfqq_sync(cfqq)) {
-		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
-		__cfq_update_io_thinktime(&cfqq->service_tree->ttime,
-			cfqd->cfq_slice_idle);
-	}
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
-#endif
-}
-
-static void
-cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		       struct request *rq)
-{
-	sector_t sdist = 0;
-	sector_t n_sec = blk_rq_sectors(rq);
-	if (cfqq->last_request_pos) {
-		if (cfqq->last_request_pos < blk_rq_pos(rq))
-			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
-		else
-			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
-	}
-
-	cfqq->seek_history <<= 1;
-	if (blk_queue_nonrot(cfqd->queue))
-		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
-	else
-		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
-}
-
-static inline bool req_noidle(struct request *req)
-{
-	return req_op(req) == REQ_OP_WRITE &&
-		(req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC;
-}
-
-/*
- * Disable idle window if the process thinks too long or seeks so much that
- * it doesn't matter
- */
-static void
-cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		       struct cfq_io_cq *cic)
-{
-	int old_idle, enable_idle;
-
-	/*
-	 * Don't idle for async or idle io prio class
-	 */
-	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
-		return;
-
-	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
-
-	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
-		cfq_mark_cfqq_deep(cfqq);
-
-	if (cfqq->next_rq && req_noidle(cfqq->next_rq))
-		enable_idle = 0;
-	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
-		 !cfqd->cfq_slice_idle ||
-		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
-		enable_idle = 0;
-	else if (sample_valid(cic->ttime.ttime_samples)) {
-		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
-			enable_idle = 0;
-		else
-			enable_idle = 1;
-	}
-
-	if (old_idle != enable_idle) {
-		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
-		if (enable_idle)
-			cfq_mark_cfqq_idle_window(cfqq);
-		else
-			cfq_clear_cfqq_idle_window(cfqq);
-	}
-}
-
-/*
- * Check if new_cfqq should preempt the currently active queue. Return 0 for
- * no or if we aren't sure, a 1 will cause a preempt.
- */
-static bool
-cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
-		   struct request *rq)
-{
-	struct cfq_queue *cfqq;
-
-	cfqq = cfqd->active_queue;
-	if (!cfqq)
-		return false;
-
-	if (cfq_class_idle(new_cfqq))
-		return false;
-
-	if (cfq_class_idle(cfqq))
-		return true;
-
-	/*
-	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
-	 */
-	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
-		return false;
-
-	/*
-	 * if the new request is sync, but the currently running queue is
-	 * not, let the sync request have priority.
-	 */
-	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
-		return true;
-
-	/*
-	 * Treat ancestors of current cgroup the same way as current cgroup.
-	 * For anybody else we disallow preemption to guarantee service
-	 * fairness among cgroups.
-	 */
-	if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg))
-		return false;
-
-	if (cfq_slice_used(cfqq))
-		return true;
-
-	/*
-	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
-	 */
-	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
-		return true;
-
-	WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class);
-	/* Allow preemption only if we are idling on sync-noidle tree */
-	if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD &&
-	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
-	    RB_EMPTY_ROOT(&cfqq->sort_list))
-		return true;
-
-	/*
-	 * So both queues are sync. Let the new request get disk time if
-	 * it's a metadata request and the current queue is doing regular IO.
-	 */
-	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
-		return true;
-
-	/* An idle queue should not be idle now for some reason */
-	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
-		return true;
-
-	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
-		return false;
-
-	/*
-	 * if this request is as-good as one we would expect from the
-	 * current cfqq, let it preempt
-	 */
-	if (cfq_rq_close(cfqd, cfqq, rq))
-		return true;
-
-	return false;
-}
-
-/*
- * cfqq preempts the active queue. if we allowed preempt with no slice left,
- * let it have half of its nominal slice.
- */
-static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
-
-	cfq_log_cfqq(cfqd, cfqq, "preempt");
-	cfq_slice_expired(cfqd, 1);
-
-	/*
-	 * workload type is changed, don't save slice, otherwise preempt
-	 * doesn't happen
-	 */
-	if (old_type != cfqq_type(cfqq))
-		cfqq->cfqg->saved_wl_slice = 0;
-
-	/*
-	 * Put the new queue at the front of the of the current list,
-	 * so we know that it will be selected next.
-	 */
-	BUG_ON(!cfq_cfqq_on_rr(cfqq));
-
-	cfq_service_tree_add(cfqd, cfqq, 1);
-
-	cfqq->slice_end = 0;
-	cfq_mark_cfqq_slice_new(cfqq);
-}
-
-/*
- * Called when a new fs request (rq) is added (to cfqq). Check if there's
- * something we should do about it
- */
-static void
-cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
-		struct request *rq)
-{
-	struct cfq_io_cq *cic = RQ_CIC(rq);
-
-	cfqd->rq_queued++;
-	if (rq->cmd_flags & REQ_PRIO)
-		cfqq->prio_pending++;
-
-	cfq_update_io_thinktime(cfqd, cfqq, cic);
-	cfq_update_io_seektime(cfqd, cfqq, rq);
-	cfq_update_idle_window(cfqd, cfqq, cic);
-
-	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
-
-	if (cfqq == cfqd->active_queue) {
-		/*
-		 * Remember that we saw a request from this process, but
-		 * don't start queuing just yet. Otherwise we risk seeing lots
-		 * of tiny requests, because we disrupt the normal plugging
-		 * and merging. If the request is already larger than a single
-		 * page, let it rip immediately. For that case we assume that
-		 * merging is already done. Ditto for a busy system that
-		 * has other work pending, don't risk delaying until the
-		 * idle timer unplug to continue working.
-		 */
-		if (cfq_cfqq_wait_request(cfqq)) {
-			if (blk_rq_bytes(rq) > PAGE_SIZE ||
-			    cfqd->busy_queues > 1) {
-				cfq_del_timer(cfqd, cfqq);
-				cfq_clear_cfqq_wait_request(cfqq);
-				__blk_run_queue(cfqd->queue);
-			} else {
-				cfqg_stats_update_idle_time(cfqq->cfqg);
-				cfq_mark_cfqq_must_dispatch(cfqq);
-			}
-		}
-	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
-		/*
-		 * not the active queue - expire current slice if it is
-		 * idle and has expired it's mean thinktime or this new queue
-		 * has some old slice time left and is of higher priority or
-		 * this new queue is RT and the current one is BE
-		 */
-		cfq_preempt_queue(cfqd, cfqq);
-		__blk_run_queue(cfqd->queue);
-	}
-}
-
-static void cfq_insert_request(struct request_queue *q, struct request *rq)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-
-	cfq_log_cfqq(cfqd, cfqq, "insert_request");
-	cfq_init_prio_data(cfqq, RQ_CIC(rq));
-
-	rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)];
-	list_add_tail(&rq->queuelist, &cfqq->fifo);
-	cfq_add_rq_rb(rq);
-	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
-				 rq->cmd_flags);
-	cfq_rq_enqueued(cfqd, cfqq, rq);
-}
-
-/*
- * Update hw_tag based on peak queue depth over 50 samples under
- * sufficient load.
- */
-static void cfq_update_hw_tag(struct cfq_data *cfqd)
-{
-	struct cfq_queue *cfqq = cfqd->active_queue;
-
-	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
-		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
-
-	if (cfqd->hw_tag == 1)
-		return;
-
-	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
-	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
-		return;
-
-	/*
-	 * If active queue hasn't enough requests and can idle, cfq might not
-	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
-	 * case
-	 */
-	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
-	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
-	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
-		return;
-
-	if (cfqd->hw_tag_samples++ < 50)
-		return;
-
-	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
-		cfqd->hw_tag = 1;
-	else
-		cfqd->hw_tag = 0;
-}
-
-static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
-{
-	struct cfq_io_cq *cic = cfqd->active_cic;
-	u64 now = ktime_get_ns();
-
-	/* If the queue already has requests, don't wait */
-	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
-		return false;
-
-	/* If there are other queues in the group, don't wait */
-	if (cfqq->cfqg->nr_cfqq > 1)
-		return false;
-
-	/* the only queue in the group, but think time is big */
-	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
-		return false;
-
-	if (cfq_slice_used(cfqq))
-		return true;
-
-	/* if slice left is less than think time, wait busy */
-	if (cic && sample_valid(cic->ttime.ttime_samples)
-	    && (cfqq->slice_end - now < cic->ttime.ttime_mean))
-		return true;
-
-	/*
-	 * If think times is less than a jiffy than ttime_mean=0 and above
-	 * will not be true. It might happen that slice has not expired yet
-	 * but will expire soon (4-5 ns) during select_queue(). To cover the
-	 * case where think time is less than a jiffy, mark the queue wait
-	 * busy if only 1 jiffy is left in the slice.
-	 */
-	if (cfqq->slice_end - now <= jiffies_to_nsecs(1))
-		return true;
-
-	return false;
-}
-
-static void cfq_completed_request(struct request_queue *q, struct request *rq)
-{
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-	struct cfq_data *cfqd = cfqq->cfqd;
-	const int sync = rq_is_sync(rq);
-	u64 now = ktime_get_ns();
-
-	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq));
-
-	cfq_update_hw_tag(cfqd);
-
-	WARN_ON(!cfqd->rq_in_driver);
-	WARN_ON(!cfqq->dispatched);
-	cfqd->rq_in_driver--;
-	cfqq->dispatched--;
-	(RQ_CFQG(rq))->dispatched--;
-	cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
-				     rq->io_start_time_ns, rq->cmd_flags);
-
-	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
-
-	if (sync) {
-		struct cfq_rb_root *st;
-
-		RQ_CIC(rq)->ttime.last_end_request = now;
-
-		if (cfq_cfqq_on_rr(cfqq))
-			st = cfqq->service_tree;
-		else
-			st = st_for(cfqq->cfqg, cfqq_class(cfqq),
-					cfqq_type(cfqq));
-
-		st->ttime.last_end_request = now;
-		if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now)
-			cfqd->last_delayed_sync = now;
-	}
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	cfqq->cfqg->ttime.last_end_request = now;
-#endif
-
-	/*
-	 * If this is the active queue, check if it needs to be expired,
-	 * or if we want to idle in case it has no pending requests.
-	 */
-	if (cfqd->active_queue == cfqq) {
-		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
-
-		if (cfq_cfqq_slice_new(cfqq)) {
-			cfq_set_prio_slice(cfqd, cfqq);
-			cfq_clear_cfqq_slice_new(cfqq);
-		}
-
-		/*
-		 * Should we wait for next request to come in before we expire
-		 * the queue.
-		 */
-		if (cfq_should_wait_busy(cfqd, cfqq)) {
-			u64 extend_sl = cfqd->cfq_slice_idle;
-			if (!cfqd->cfq_slice_idle)
-				extend_sl = cfqd->cfq_group_idle;
-			cfqq->slice_end = now + extend_sl;
-			cfq_mark_cfqq_wait_busy(cfqq);
-			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
-		}
-
-		/*
-		 * Idling is not enabled on:
-		 * - expired queues
-		 * - idle-priority queues
-		 * - async queues
-		 * - queues with still some requests queued
-		 * - when there is a close cooperator
-		 */
-		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
-			cfq_slice_expired(cfqd, 1);
-		else if (sync && cfqq_empty &&
-			 !cfq_close_cooperator(cfqd, cfqq)) {
-			cfq_arm_slice_timer(cfqd);
-		}
-	}
-
-	if (!cfqd->rq_in_driver)
-		cfq_schedule_dispatch(cfqd);
-}
-
-static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op)
-{
-	/*
-	 * If REQ_PRIO is set, boost class and prio level, if it's below
-	 * BE/NORM. If prio is not set, restore the potentially boosted
-	 * class/prio level.
-	 */
-	if (!(op & REQ_PRIO)) {
-		cfqq->ioprio_class = cfqq->org_ioprio_class;
-		cfqq->ioprio = cfqq->org_ioprio;
-	} else {
-		if (cfq_class_idle(cfqq))
-			cfqq->ioprio_class = IOPRIO_CLASS_BE;
-		if (cfqq->ioprio > IOPRIO_NORM)
-			cfqq->ioprio = IOPRIO_NORM;
-	}
-}
-
-static inline int __cfq_may_queue(struct cfq_queue *cfqq)
-{
-	if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
-		cfq_mark_cfqq_must_alloc_slice(cfqq);
-		return ELV_MQUEUE_MUST;
-	}
-
-	return ELV_MQUEUE_MAY;
-}
-
-static int cfq_may_queue(struct request_queue *q, unsigned int op)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct task_struct *tsk = current;
-	struct cfq_io_cq *cic;
-	struct cfq_queue *cfqq;
-
-	/*
-	 * don't force setup of a queue from here, as a call to may_queue
-	 * does not necessarily imply that a request actually will be queued.
-	 * so just lookup a possibly existing queue, or return 'may queue'
-	 * if that fails
-	 */
-	cic = cfq_cic_lookup(cfqd, tsk->io_context);
-	if (!cic)
-		return ELV_MQUEUE_MAY;
-
-	cfqq = cic_to_cfqq(cic, op_is_sync(op));
-	if (cfqq) {
-		cfq_init_prio_data(cfqq, cic);
-		cfqq_boost_on_prio(cfqq, op);
-
-		return __cfq_may_queue(cfqq);
-	}
-
-	return ELV_MQUEUE_MAY;
-}
-
-/*
- * queue lock held here
- */
-static void cfq_put_request(struct request *rq)
-{
-	struct cfq_queue *cfqq = RQ_CFQQ(rq);
-
-	if (cfqq) {
-		const int rw = rq_data_dir(rq);
-
-		BUG_ON(!cfqq->allocated[rw]);
-		cfqq->allocated[rw]--;
-
-		/* Put down rq reference on cfqg */
-		cfqg_put(RQ_CFQG(rq));
-		rq->elv.priv[0] = NULL;
-		rq->elv.priv[1] = NULL;
-
-		cfq_put_queue(cfqq);
-	}
-}
-
-static struct cfq_queue *
-cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
-		struct cfq_queue *cfqq)
-{
-	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
-	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
-	cfq_mark_cfqq_coop(cfqq->new_cfqq);
-	cfq_put_queue(cfqq);
-	return cic_to_cfqq(cic, 1);
-}
-
-/*
- * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
- * was the last process referring to said cfqq.
- */
-static struct cfq_queue *
-split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
-{
-	if (cfqq_process_refs(cfqq) == 1) {
-		cfqq->pid = current->pid;
-		cfq_clear_cfqq_coop(cfqq);
-		cfq_clear_cfqq_split_coop(cfqq);
-		return cfqq;
-	}
-
-	cic_set_cfqq(cic, NULL, 1);
-
-	cfq_put_cooperator(cfqq);
-
-	cfq_put_queue(cfqq);
-	return NULL;
-}
-/*
- * Allocate cfq data structures associated with this request.
- */
-static int
-cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
-		gfp_t gfp_mask)
-{
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
-	const int rw = rq_data_dir(rq);
-	const bool is_sync = rq_is_sync(rq);
-	struct cfq_queue *cfqq;
-
-	spin_lock_irq(q->queue_lock);
-
-	check_ioprio_changed(cic, bio);
-	check_blkcg_changed(cic, bio);
-new_queue:
-	cfqq = cic_to_cfqq(cic, is_sync);
-	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		if (cfqq)
-			cfq_put_queue(cfqq);
-		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
-		cic_set_cfqq(cic, cfqq, is_sync);
-	} else {
-		/*
-		 * If the queue was seeky for too long, break it apart.
-		 */
-		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
-			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
-			cfqq = split_cfqq(cic, cfqq);
-			if (!cfqq)
-				goto new_queue;
-		}
-
-		/*
-		 * Check to see if this queue is scheduled to merge with
-		 * another, closely cooperating queue.  The merging of
-		 * queues happens here as it must be done in process context.
-		 * The reference on new_cfqq was taken in merge_cfqqs.
-		 */
-		if (cfqq->new_cfqq)
-			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
-	}
-
-	cfqq->allocated[rw]++;
-
-	cfqq->ref++;
-	cfqg_get(cfqq->cfqg);
-	rq->elv.priv[0] = cfqq;
-	rq->elv.priv[1] = cfqq->cfqg;
-	spin_unlock_irq(q->queue_lock);
-
-	return 0;
-}
-
-static void cfq_kick_queue(struct work_struct *work)
-{
-	struct cfq_data *cfqd =
-		container_of(work, struct cfq_data, unplug_work);
-	struct request_queue *q = cfqd->queue;
-
-	spin_lock_irq(q->queue_lock);
-	__blk_run_queue(cfqd->queue);
-	spin_unlock_irq(q->queue_lock);
-}
-
-/*
- * Timer running if the active_queue is currently idling inside its time slice
- */
-static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer)
-{
-	struct cfq_data *cfqd = container_of(timer, struct cfq_data,
-					     idle_slice_timer);
-	struct cfq_queue *cfqq;
-	unsigned long flags;
-	int timed_out = 1;
-
-	cfq_log(cfqd, "idle timer fired");
-
-	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-
-	cfqq = cfqd->active_queue;
-	if (cfqq) {
-		timed_out = 0;
-
-		/*
-		 * We saw a request before the queue expired, let it through
-		 */
-		if (cfq_cfqq_must_dispatch(cfqq))
-			goto out_kick;
-
-		/*
-		 * expired
-		 */
-		if (cfq_slice_used(cfqq))
-			goto expire;
-
-		/*
-		 * only expire and reinvoke request handler, if there are
-		 * other queues with pending requests
-		 */
-		if (!cfqd->busy_queues)
-			goto out_cont;
-
-		/*
-		 * not expired and it has a request pending, let it dispatch
-		 */
-		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
-			goto out_kick;
-
-		/*
-		 * Queue depth flag is reset only when the idle didn't succeed
-		 */
-		cfq_clear_cfqq_deep(cfqq);
-	}
-expire:
-	cfq_slice_expired(cfqd, timed_out);
-out_kick:
-	cfq_schedule_dispatch(cfqd);
-out_cont:
-	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
-	return HRTIMER_NORESTART;
-}
-
-static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
-{
-	hrtimer_cancel(&cfqd->idle_slice_timer);
-	cancel_work_sync(&cfqd->unplug_work);
-}
-
-static void cfq_exit_queue(struct elevator_queue *e)
-{
-	struct cfq_data *cfqd = e->elevator_data;
-	struct request_queue *q = cfqd->queue;
-
-	cfq_shutdown_timer_wq(cfqd);
-
-	spin_lock_irq(q->queue_lock);
-
-	if (cfqd->active_queue)
-		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
-
-	spin_unlock_irq(q->queue_lock);
-
-	cfq_shutdown_timer_wq(cfqd);
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	blkcg_deactivate_policy(q, &blkcg_policy_cfq);
-#else
-	kfree(cfqd->root_group);
-#endif
-	kfree(cfqd);
-}
-
-static int cfq_init_queue(struct request_queue *q, struct elevator_type *e)
-{
-	struct cfq_data *cfqd;
-	struct blkcg_gq *blkg __maybe_unused;
-	int i, ret;
-	struct elevator_queue *eq;
-
-	eq = elevator_alloc(q, e);
-	if (!eq)
-		return -ENOMEM;
-
-	cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node);
-	if (!cfqd) {
-		kobject_put(&eq->kobj);
-		return -ENOMEM;
-	}
-	eq->elevator_data = cfqd;
-
-	cfqd->queue = q;
-	spin_lock_irq(q->queue_lock);
-	q->elevator = eq;
-	spin_unlock_irq(q->queue_lock);
-
-	/* Init root service tree */
-	cfqd->grp_service_tree = CFQ_RB_ROOT;
-
-	/* Init root group and prefer root group over other groups by default */
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	ret = blkcg_activate_policy(q, &blkcg_policy_cfq);
-	if (ret)
-		goto out_free;
-
-	cfqd->root_group = blkg_to_cfqg(q->root_blkg);
-#else
-	ret = -ENOMEM;
-	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
-					GFP_KERNEL, cfqd->queue->node);
-	if (!cfqd->root_group)
-		goto out_free;
-
-	cfq_init_cfqg_base(cfqd->root_group);
-	cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
-	cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL;
-#endif
-
-	/*
-	 * Not strictly needed (since RB_ROOT just clears the node and we
-	 * zeroed cfqd on alloc), but better be safe in case someone decides
-	 * to add magic to the rb code
-	 */
-	for (i = 0; i < CFQ_PRIO_LISTS; i++)
-		cfqd->prio_trees[i] = RB_ROOT;
-
-	/*
-	 * Our fallback cfqq if cfq_get_queue() runs into OOM issues.
-	 * Grab a permanent reference to it, so that the normal code flow
-	 * will not attempt to free it.  oom_cfqq is linked to root_group
-	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
-	 * the reference from linking right away.
-	 */
-	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
-	cfqd->oom_cfqq.ref++;
-
-	spin_lock_irq(q->queue_lock);
-	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
-	cfqg_put(cfqd->root_group);
-	spin_unlock_irq(q->queue_lock);
-
-	hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC,
-		     HRTIMER_MODE_REL);
-	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
-
-	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
-
-	cfqd->cfq_quantum = cfq_quantum;
-	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
-	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
-	cfqd->cfq_back_max = cfq_back_max;
-	cfqd->cfq_back_penalty = cfq_back_penalty;
-	cfqd->cfq_slice[0] = cfq_slice_async;
-	cfqd->cfq_slice[1] = cfq_slice_sync;
-	cfqd->cfq_target_latency = cfq_target_latency;
-	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
-	cfqd->cfq_slice_idle = cfq_slice_idle;
-	cfqd->cfq_group_idle = cfq_group_idle;
-	cfqd->cfq_latency = 1;
-	cfqd->hw_tag = -1;
-	/*
-	 * we optimistically start assuming sync ops weren't delayed in last
-	 * second, in order to have larger depth for async operations.
-	 */
-	cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC;
-	return 0;
-
-out_free:
-	kfree(cfqd);
-	kobject_put(&eq->kobj);
-	return ret;
-}
-
-static void cfq_registered_queue(struct request_queue *q)
-{
-	struct elevator_queue *e = q->elevator;
-	struct cfq_data *cfqd = e->elevator_data;
-
-	/*
-	 * Default to IOPS mode with no idling for SSDs
-	 */
-	if (blk_queue_nonrot(q))
-		cfqd->cfq_slice_idle = 0;
-	wbt_disable_default(q);
-}
-
-/*
- * sysfs parts below -->
- */
-static ssize_t
-cfq_var_show(unsigned int var, char *page)
-{
-	return sprintf(page, "%u\n", var);
-}
-
-static void
-cfq_var_store(unsigned int *var, const char *page)
-{
-	char *p = (char *) page;
-
-	*var = simple_strtoul(p, &p, 10);
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-{									\
-	struct cfq_data *cfqd = e->elevator_data;			\
-	u64 __data = __VAR;						\
-	if (__CONV)							\
-		__data = div_u64(__data, NSEC_PER_MSEC);			\
-	return cfq_var_show(__data, (page));				\
-}
-SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
-SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
-SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
-SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
-SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
-SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
-SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
-SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
-SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
-SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
-SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
-SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1);
-#undef SHOW_FUNCTION
-
-#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
-static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-{									\
-	struct cfq_data *cfqd = e->elevator_data;			\
-	u64 __data = __VAR;						\
-	__data = div_u64(__data, NSEC_PER_USEC);			\
-	return cfq_var_show(__data, (page));				\
-}
-USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle);
-USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle);
-USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]);
-USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]);
-USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency);
-#undef USEC_SHOW_FUNCTION
-
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
-{									\
-	struct cfq_data *cfqd = e->elevator_data;			\
-	unsigned int __data, __min = (MIN), __max = (MAX);		\
-									\
-	cfq_var_store(&__data, (page));					\
-	if (__data < __min)						\
-		__data = __min;						\
-	else if (__data > __max)					\
-		__data = __max;						\
-	if (__CONV)							\
-		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
-	else								\
-		*(__PTR) = __data;					\
-	return count;							\
-}
-STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
-STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
-		UINT_MAX, 1);
-STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
-		UINT_MAX, 1);
-STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
-STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
-		UINT_MAX, 0);
-STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
-STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
-STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
-		UINT_MAX, 0);
-STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
-STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1);
-#undef STORE_FUNCTION
-
-#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
-{									\
-	struct cfq_data *cfqd = e->elevator_data;			\
-	unsigned int __data, __min = (MIN), __max = (MAX);		\
-									\
-	cfq_var_store(&__data, (page));					\
-	if (__data < __min)						\
-		__data = __min;						\
-	else if (__data > __max)					\
-		__data = __max;						\
-	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
-	return count;							\
-}
-USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX);
-USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX);
-USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX);
-USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX);
-USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX);
-#undef USEC_STORE_FUNCTION
-
-#define CFQ_ATTR(name) \
-	__ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store)
-
-static struct elv_fs_entry cfq_attrs[] = {
-	CFQ_ATTR(quantum),
-	CFQ_ATTR(fifo_expire_sync),
-	CFQ_ATTR(fifo_expire_async),
-	CFQ_ATTR(back_seek_max),
-	CFQ_ATTR(back_seek_penalty),
-	CFQ_ATTR(slice_sync),
-	CFQ_ATTR(slice_sync_us),
-	CFQ_ATTR(slice_async),
-	CFQ_ATTR(slice_async_us),
-	CFQ_ATTR(slice_async_rq),
-	CFQ_ATTR(slice_idle),
-	CFQ_ATTR(slice_idle_us),
-	CFQ_ATTR(group_idle),
-	CFQ_ATTR(group_idle_us),
-	CFQ_ATTR(low_latency),
-	CFQ_ATTR(target_latency),
-	CFQ_ATTR(target_latency_us),
-	__ATTR_NULL
-};
-
-static struct elevator_type iosched_cfq = {
-	.ops.sq = {
-		.elevator_merge_fn = 		cfq_merge,
-		.elevator_merged_fn =		cfq_merged_request,
-		.elevator_merge_req_fn =	cfq_merged_requests,
-		.elevator_allow_bio_merge_fn =	cfq_allow_bio_merge,
-		.elevator_allow_rq_merge_fn =	cfq_allow_rq_merge,
-		.elevator_bio_merged_fn =	cfq_bio_merged,
-		.elevator_dispatch_fn =		cfq_dispatch_requests,
-		.elevator_add_req_fn =		cfq_insert_request,
-		.elevator_activate_req_fn =	cfq_activate_request,
-		.elevator_deactivate_req_fn =	cfq_deactivate_request,
-		.elevator_completed_req_fn =	cfq_completed_request,
-		.elevator_former_req_fn =	elv_rb_former_request,
-		.elevator_latter_req_fn =	elv_rb_latter_request,
-		.elevator_init_icq_fn =		cfq_init_icq,
-		.elevator_exit_icq_fn =		cfq_exit_icq,
-		.elevator_set_req_fn =		cfq_set_request,
-		.elevator_put_req_fn =		cfq_put_request,
-		.elevator_may_queue_fn =	cfq_may_queue,
-		.elevator_init_fn =		cfq_init_queue,
-		.elevator_exit_fn =		cfq_exit_queue,
-		.elevator_registered_fn =	cfq_registered_queue,
-	},
-	.icq_size	=	sizeof(struct cfq_io_cq),
-	.icq_align	=	__alignof__(struct cfq_io_cq),
-	.elevator_attrs =	cfq_attrs,
-	.elevator_name	=	"cfq",
-	.elevator_owner =	THIS_MODULE,
-};
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-static struct blkcg_policy blkcg_policy_cfq = {
-	.dfl_cftypes		= cfq_blkcg_files,
-	.legacy_cftypes		= cfq_blkcg_legacy_files,
-
-	.cpd_alloc_fn		= cfq_cpd_alloc,
-	.cpd_init_fn		= cfq_cpd_init,
-	.cpd_free_fn		= cfq_cpd_free,
-	.cpd_bind_fn		= cfq_cpd_bind,
-
-	.pd_alloc_fn		= cfq_pd_alloc,
-	.pd_init_fn		= cfq_pd_init,
-	.pd_offline_fn		= cfq_pd_offline,
-	.pd_free_fn		= cfq_pd_free,
-	.pd_reset_stats_fn	= cfq_pd_reset_stats,
-};
-#endif
-
-static int __init cfq_init(void)
-{
-	int ret;
-
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	ret = blkcg_policy_register(&blkcg_policy_cfq);
-	if (ret)
-		return ret;
-#else
-	cfq_group_idle = 0;
-#endif
-
-	ret = -ENOMEM;
-	cfq_pool = KMEM_CACHE(cfq_queue, 0);
-	if (!cfq_pool)
-		goto err_pol_unreg;
-
-	ret = elv_register(&iosched_cfq);
-	if (ret)
-		goto err_free_pool;
-
-	return 0;
-
-err_free_pool:
-	kmem_cache_destroy(cfq_pool);
-err_pol_unreg:
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	blkcg_policy_unregister(&blkcg_policy_cfq);
-#endif
-	return ret;
-}
-
-static void __exit cfq_exit(void)
-{
-#ifdef CONFIG_CFQ_GROUP_IOSCHED
-	blkcg_policy_unregister(&blkcg_policy_cfq);
-#endif
-	elv_unregister(&iosched_cfq);
-	kmem_cache_destroy(cfq_pool);
-}
-
-module_init(cfq_init);
-module_exit(cfq_exit);
-
-MODULE_AUTHOR("Jens Axboe");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
deleted file mode 100644
index ef2f1f09e9b3..000000000000
--- a/block/deadline-iosched.c
+++ /dev/null
@@ -1,560 +0,0 @@
-/*
- *  Deadline i/o scheduler.
- *
- *  Copyright (C) 2002 Jens Axboe <axboe@kernel.dk>
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/rbtree.h>
-
-/*
- * See Documentation/block/deadline-iosched.txt
- */
-static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
-static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-static const int writes_starved = 2;    /* max times reads can starve a write */
-static const int fifo_batch = 16;       /* # of sequential requests treated as one
-				     by the above parameters. For throughput. */
-
-struct deadline_data {
-	/*
-	 * run time data
-	 */
-
-	/*
-	 * requests (deadline_rq s) are present on both sort_list and fifo_list
-	 */
-	struct rb_root sort_list[2];	
-	struct list_head fifo_list[2];
-
-	/*
-	 * next in sort order. read, write or both are NULL
-	 */
-	struct request *next_rq[2];
-	unsigned int batching;		/* number of sequential requests made */
-	unsigned int starved;		/* times reads have starved writes */
-
-	/*
-	 * settings that change how the i/o scheduler behaves
-	 */
-	int fifo_expire[2];
-	int fifo_batch;
-	int writes_starved;
-	int front_merges;
-};
-
-static inline struct rb_root *
-deadline_rb_root(struct deadline_data *dd, struct request *rq)
-{
-	return &dd->sort_list[rq_data_dir(rq)];
-}
-
-/*
- * get the request after `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_latter_request(struct request *rq)
-{
-	struct rb_node *node = rb_next(&rq->rb_node);
-
-	if (node)
-		return rb_entry_rq(node);
-
-	return NULL;
-}
-
-static void
-deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
-{
-	struct rb_root *root = deadline_rb_root(dd, rq);
-
-	elv_rb_add(root, rq);
-}
-
-static inline void
-deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
-{
-	const int data_dir = rq_data_dir(rq);
-
-	if (dd->next_rq[data_dir] == rq)
-		dd->next_rq[data_dir] = deadline_latter_request(rq);
-
-	elv_rb_del(deadline_rb_root(dd, rq), rq);
-}
-
-/*
- * add rq to rbtree and fifo
- */
-static void
-deadline_add_request(struct request_queue *q, struct request *rq)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-	const int data_dir = rq_data_dir(rq);
-
-	/*
-	 * This may be a requeue of a write request that has locked its
-	 * target zone. If it is the case, this releases the zone lock.
-	 */
-	blk_req_zone_write_unlock(rq);
-
-	deadline_add_rq_rb(dd, rq);
-
-	/*
-	 * set expire time and add to fifo list
-	 */
-	rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
-	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
-}
-
-/*
- * remove rq from rbtree and fifo.
- */
-static void deadline_remove_request(struct request_queue *q, struct request *rq)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-
-	rq_fifo_clear(rq);
-	deadline_del_rq_rb(dd, rq);
-}
-
-static enum elv_merge
-deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-	struct request *__rq;
-
-	/*
-	 * check for front merge
-	 */
-	if (dd->front_merges) {
-		sector_t sector = bio_end_sector(bio);
-
-		__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
-		if (__rq) {
-			BUG_ON(sector != blk_rq_pos(__rq));
-
-			if (elv_bio_merge_ok(__rq, bio)) {
-				*req = __rq;
-				return ELEVATOR_FRONT_MERGE;
-			}
-		}
-	}
-
-	return ELEVATOR_NO_MERGE;
-}
-
-static void deadline_merged_request(struct request_queue *q,
-				    struct request *req, enum elv_merge type)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-
-	/*
-	 * if the merge was a front merge, we need to reposition request
-	 */
-	if (type == ELEVATOR_FRONT_MERGE) {
-		elv_rb_del(deadline_rb_root(dd, req), req);
-		deadline_add_rq_rb(dd, req);
-	}
-}
-
-static void
-deadline_merged_requests(struct request_queue *q, struct request *req,
-			 struct request *next)
-{
-	/*
-	 * if next expires before rq, assign its expire time to rq
-	 * and move into next position (next will be deleted) in fifo
-	 */
-	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
-		if (time_before((unsigned long)next->fifo_time,
-				(unsigned long)req->fifo_time)) {
-			list_move(&req->queuelist, &next->queuelist);
-			req->fifo_time = next->fifo_time;
-		}
-	}
-
-	/*
-	 * kill knowledge of next, this one is a goner
-	 */
-	deadline_remove_request(q, next);
-}
-
-/*
- * move request from sort list to dispatch queue.
- */
-static inline void
-deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq)
-{
-	struct request_queue *q = rq->q;
-
-	/*
-	 * For a zoned block device, write requests must write lock their
-	 * target zone.
-	 */
-	blk_req_zone_write_lock(rq);
-
-	deadline_remove_request(q, rq);
-	elv_dispatch_add_tail(q, rq);
-}
-
-/*
- * move an entry to dispatch queue
- */
-static void
-deadline_move_request(struct deadline_data *dd, struct request *rq)
-{
-	const int data_dir = rq_data_dir(rq);
-
-	dd->next_rq[READ] = NULL;
-	dd->next_rq[WRITE] = NULL;
-	dd->next_rq[data_dir] = deadline_latter_request(rq);
-
-	/*
-	 * take it off the sort and fifo list, move
-	 * to dispatch queue
-	 */
-	deadline_move_to_dispatch(dd, rq);
-}
-
-/*
- * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
- * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
- */
-static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
-{
-	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
-
-	/*
-	 * rq is expired!
-	 */
-	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
-		return 1;
-
-	return 0;
-}
-
-/*
- * For the specified data direction, return the next request to dispatch using
- * arrival ordered lists.
- */
-static struct request *
-deadline_fifo_request(struct deadline_data *dd, int data_dir)
-{
-	struct request *rq;
-
-	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
-		return NULL;
-
-	if (list_empty(&dd->fifo_list[data_dir]))
-		return NULL;
-
-	rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
-	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
-		return rq;
-
-	/*
-	 * Look for a write request that can be dispatched, that is one with
-	 * an unlocked target zone.
-	 */
-	list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
-		if (blk_req_can_dispatch_to_zone(rq))
-			return rq;
-	}
-
-	return NULL;
-}
-
-/*
- * For the specified data direction, return the next request to dispatch using
- * sector position sorted lists.
- */
-static struct request *
-deadline_next_request(struct deadline_data *dd, int data_dir)
-{
-	struct request *rq;
-
-	if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
-		return NULL;
-
-	rq = dd->next_rq[data_dir];
-	if (!rq)
-		return NULL;
-
-	if (data_dir == READ || !blk_queue_is_zoned(rq->q))
-		return rq;
-
-	/*
-	 * Look for a write request that can be dispatched, that is one with
-	 * an unlocked target zone.
-	 */
-	while (rq) {
-		if (blk_req_can_dispatch_to_zone(rq))
-			return rq;
-		rq = deadline_latter_request(rq);
-	}
-
-	return NULL;
-}
-
-/*
- * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
- */
-static int deadline_dispatch_requests(struct request_queue *q, int force)
-{
-	struct deadline_data *dd = q->elevator->elevator_data;
-	const int reads = !list_empty(&dd->fifo_list[READ]);
-	const int writes = !list_empty(&dd->fifo_list[WRITE]);
-	struct request *rq, *next_rq;
-	int data_dir;
-
-	/*
-	 * batches are currently reads XOR writes
-	 */
-	rq = deadline_next_request(dd, WRITE);
-	if (!rq)
-		rq = deadline_next_request(dd, READ);
-
-	if (rq && dd->batching < dd->fifo_batch)
-		/* we have a next request are still entitled to batch */
-		goto dispatch_request;
-
-	/*
-	 * at this point we are not running a batch. select the appropriate
-	 * data direction (read / write)
-	 */
-
-	if (reads) {
-		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
-
-		if (deadline_fifo_request(dd, WRITE) &&
-		    (dd->starved++ >= dd->writes_starved))
-			goto dispatch_writes;
-
-		data_dir = READ;
-
-		goto dispatch_find_request;
-	}
-
-	/*
-	 * there are either no reads or writes have been starved
-	 */
-
-	if (writes) {
-dispatch_writes:
-		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
-
-		dd->starved = 0;
-
-		data_dir = WRITE;
-
-		goto dispatch_find_request;
-	}
-
-	return 0;
-
-dispatch_find_request:
-	/*
-	 * we are not running a batch, find best request for selected data_dir
-	 */
-	next_rq = deadline_next_request(dd, data_dir);
-	if (deadline_check_fifo(dd, data_dir) || !next_rq) {
-		/*
-		 * A deadline has expired, the last request was in the other
-		 * direction, or we have run out of higher-sectored requests.
-		 * Start again from the request with the earliest expiry time.
-		 */
-		rq = deadline_fifo_request(dd, data_dir);
-	} else {
-		/*
-		 * The last req was the same dir and we have a next request in
-		 * sort order. No expired requests so continue on from here.
-		 */
-		rq = next_rq;
-	}
-
-	/*
-	 * For a zoned block device, if we only have writes queued and none of
-	 * them can be dispatched, rq will be NULL.
-	 */
-	if (!rq)
-		return 0;
-
-	dd->batching = 0;
-
-dispatch_request:
-	/*
-	 * rq is the selected appropriate request.
-	 */
-	dd->batching++;
-	deadline_move_request(dd, rq);
-
-	return 1;
-}
-
-/*
- * For zoned block devices, write unlock the target zone of completed
- * write requests.
- */
-static void
-deadline_completed_request(struct request_queue *q, struct request *rq)
-{
-	blk_req_zone_write_unlock(rq);
-}
-
-static void deadline_exit_queue(struct elevator_queue *e)
-{
-	struct deadline_data *dd = e->elevator_data;
-
-	BUG_ON(!list_empty(&dd->fifo_list[READ]));
-	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
-
-	kfree(dd);
-}
-
-/*
- * initialize elevator private data (deadline_data).
- */
-static int deadline_init_queue(struct request_queue *q, struct elevator_type *e)
-{
-	struct deadline_data *dd;
-	struct elevator_queue *eq;
-
-	eq = elevator_alloc(q, e);
-	if (!eq)
-		return -ENOMEM;
-
-	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
-	if (!dd) {
-		kobject_put(&eq->kobj);
-		return -ENOMEM;
-	}
-	eq->elevator_data = dd;
-
-	INIT_LIST_HEAD(&dd->fifo_list[READ]);
-	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
-	dd->sort_list[READ] = RB_ROOT;
-	dd->sort_list[WRITE] = RB_ROOT;
-	dd->fifo_expire[READ] = read_expire;
-	dd->fifo_expire[WRITE] = write_expire;
-	dd->writes_starved = writes_starved;
-	dd->front_merges = 1;
-	dd->fifo_batch = fifo_batch;
-
-	spin_lock_irq(q->queue_lock);
-	q->elevator = eq;
-	spin_unlock_irq(q->queue_lock);
-	return 0;
-}
-
-/*
- * sysfs parts below
- */
-
-static ssize_t
-deadline_var_show(int var, char *page)
-{
-	return sprintf(page, "%d\n", var);
-}
-
-static void
-deadline_var_store(int *var, const char *page)
-{
-	char *p = (char *) page;
-
-	*var = simple_strtol(p, &p, 10);
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
-static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
-{									\
-	struct deadline_data *dd = e->elevator_data;			\
-	int __data = __VAR;						\
-	if (__CONV)							\
-		__data = jiffies_to_msecs(__data);			\
-	return deadline_var_show(__data, (page));			\
-}
-SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
-#undef SHOW_FUNCTION
-
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
-{									\
-	struct deadline_data *dd = e->elevator_data;			\
-	int __data;							\
-	deadline_var_store(&__data, (page));				\
-	if (__data < (MIN))						\
-		__data = (MIN);						\
-	else if (__data > (MAX))					\
-		__data = (MAX);						\
-	if (__CONV)							\
-		*(__PTR) = msecs_to_jiffies(__data);			\
-	else								\
-		*(__PTR) = __data;					\
-	return count;							\
-}
-STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
-#undef STORE_FUNCTION
-
-#define DD_ATTR(name) \
-	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
-
-static struct elv_fs_entry deadline_attrs[] = {
-	DD_ATTR(read_expire),
-	DD_ATTR(write_expire),
-	DD_ATTR(writes_starved),
-	DD_ATTR(front_merges),
-	DD_ATTR(fifo_batch),
-	__ATTR_NULL
-};
-
-static struct elevator_type iosched_deadline = {
-	.ops.sq = {
-		.elevator_merge_fn = 		deadline_merge,
-		.elevator_merged_fn =		deadline_merged_request,
-		.elevator_merge_req_fn =	deadline_merged_requests,
-		.elevator_dispatch_fn =		deadline_dispatch_requests,
-		.elevator_completed_req_fn =	deadline_completed_request,
-		.elevator_add_req_fn =		deadline_add_request,
-		.elevator_former_req_fn =	elv_rb_former_request,
-		.elevator_latter_req_fn =	elv_rb_latter_request,
-		.elevator_init_fn =		deadline_init_queue,
-		.elevator_exit_fn =		deadline_exit_queue,
-	},
-
-	.elevator_attrs = deadline_attrs,
-	.elevator_name = "deadline",
-	.elevator_owner = THIS_MODULE,
-};
-
-static int __init deadline_init(void)
-{
-	return elv_register(&iosched_deadline);
-}
-
-static void __exit deadline_exit(void)
-{
-	elv_unregister(&iosched_deadline);
-}
-
-module_init(deadline_init);
-module_exit(deadline_exit);
-
-MODULE_AUTHOR("Jens Axboe");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("deadline IO scheduler");
diff --git a/block/elevator.c b/block/elevator.c
index 8fdcd64ae12e..54e1adac26c5 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -225,8 +225,6 @@ int elevator_init(struct request_queue *q)
 							chosen_elevator);
 	}
 
-	if (!e)
-		e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false);
 	if (!e) {
 		printk(KERN_ERR
 			"Default I/O scheduler not found. Using noop.\n");
@@ -356,68 +354,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
 }
 EXPORT_SYMBOL(elv_rb_find);
 
-/*
- * Insert rq into dispatch queue of q.  Queue lock must be held on
- * entry.  rq is sort instead into the dispatch queue. To be used by
- * specific elevators.
- */
-void elv_dispatch_sort(struct request_queue *q, struct request *rq)
-{
-	sector_t boundary;
-	struct list_head *entry;
-
-	if (q->last_merge == rq)
-		q->last_merge = NULL;
-
-	elv_rqhash_del(q, rq);
-
-	q->nr_sorted--;
-
-	boundary = q->end_sector;
-	list_for_each_prev(entry, &q->queue_head) {
-		struct request *pos = list_entry_rq(entry);
-
-		if (req_op(rq) != req_op(pos))
-			break;
-		if (rq_data_dir(rq) != rq_data_dir(pos))
-			break;
-		if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER))
-			break;
-		if (blk_rq_pos(rq) >= boundary) {
-			if (blk_rq_pos(pos) < boundary)
-				continue;
-		} else {
-			if (blk_rq_pos(pos) >= boundary)
-				break;
-		}
-		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
-			break;
-	}
-
-	list_add(&rq->queuelist, entry);
-}
-EXPORT_SYMBOL(elv_dispatch_sort);
-
-/*
- * Insert rq into dispatch queue of q.  Queue lock must be held on
- * entry.  rq is added to the back of the dispatch queue. To be used by
- * specific elevators.
- */
-void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
-{
-	if (q->last_merge == rq)
-		q->last_merge = NULL;
-
-	elv_rqhash_del(q, rq);
-
-	q->nr_sorted--;
-
-	q->end_sector = rq_end_sector(rq);
-	q->boundary_rq = rq;
-	list_add_tail(&rq->queuelist, &q->queue_head);
-}
-EXPORT_SYMBOL(elv_dispatch_add_tail);
-
 enum elv_merge elv_merge(struct request_queue *q, struct request **req,
 		struct bio *bio)
 {
@@ -881,12 +817,6 @@ int elv_register(struct elevator_type *e)
 	list_add_tail(&e->list, &elv_list);
 	spin_unlock(&elv_list_lock);
 
-	/* print pretty message */
-	if (elevator_match(e, chosen_elevator) ||
-			(!*chosen_elevator &&
-			 elevator_match(e, CONFIG_DEFAULT_IOSCHED)))
-				def = " (default)";
-
 	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
 								def);
 	return 0;
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
deleted file mode 100644
index 2d1b15d89b45..000000000000
--- a/block/noop-iosched.c
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * elevator noop
- */
-#include <linux/blkdev.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-
-struct noop_data {
-	struct list_head queue;
-};
-
-static void noop_merged_requests(struct request_queue *q, struct request *rq,
-				 struct request *next)
-{
-	list_del_init(&next->queuelist);
-}
-
-static int noop_dispatch(struct request_queue *q, int force)
-{
-	struct noop_data *nd = q->elevator->elevator_data;
-	struct request *rq;
-
-	rq = list_first_entry_or_null(&nd->queue, struct request, queuelist);
-	if (rq) {
-		list_del_init(&rq->queuelist);
-		elv_dispatch_sort(q, rq);
-		return 1;
-	}
-	return 0;
-}
-
-static void noop_add_request(struct request_queue *q, struct request *rq)
-{
-	struct noop_data *nd = q->elevator->elevator_data;
-
-	list_add_tail(&rq->queuelist, &nd->queue);
-}
-
-static struct request *
-noop_former_request(struct request_queue *q, struct request *rq)
-{
-	struct noop_data *nd = q->elevator->elevator_data;
-
-	if (rq->queuelist.prev == &nd->queue)
-		return NULL;
-	return list_prev_entry(rq, queuelist);
-}
-
-static struct request *
-noop_latter_request(struct request_queue *q, struct request *rq)
-{
-	struct noop_data *nd = q->elevator->elevator_data;
-
-	if (rq->queuelist.next == &nd->queue)
-		return NULL;
-	return list_next_entry(rq, queuelist);
-}
-
-static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
-{
-	struct noop_data *nd;
-	struct elevator_queue *eq;
-
-	eq = elevator_alloc(q, e);
-	if (!eq)
-		return -ENOMEM;
-
-	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
-	if (!nd) {
-		kobject_put(&eq->kobj);
-		return -ENOMEM;
-	}
-	eq->elevator_data = nd;
-
-	INIT_LIST_HEAD(&nd->queue);
-
-	spin_lock_irq(q->queue_lock);
-	q->elevator = eq;
-	spin_unlock_irq(q->queue_lock);
-	return 0;
-}
-
-static void noop_exit_queue(struct elevator_queue *e)
-{
-	struct noop_data *nd = e->elevator_data;
-
-	BUG_ON(!list_empty(&nd->queue));
-	kfree(nd);
-}
-
-static struct elevator_type elevator_noop = {
-	.ops.sq = {
-		.elevator_merge_req_fn		= noop_merged_requests,
-		.elevator_dispatch_fn		= noop_dispatch,
-		.elevator_add_req_fn		= noop_add_request,
-		.elevator_former_req_fn		= noop_former_request,
-		.elevator_latter_req_fn		= noop_latter_request,
-		.elevator_init_fn		= noop_init_queue,
-		.elevator_exit_fn		= noop_exit_queue,
-	},
-	.elevator_name = "noop",
-	.elevator_owner = THIS_MODULE,
-};
-
-static int __init noop_init(void)
-{
-	return elv_register(&elevator_noop);
-}
-
-static void __exit noop_exit(void)
-{
-	elv_unregister(&elevator_noop);
-}
-
-module_init(noop_init);
-module_exit(noop_exit);
-
-
-MODULE_AUTHOR("Jens Axboe");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("No-op IO scheduler");

From a1ce35fa49852db60fc6e268038530be533c5b15 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:23:51 -0600
Subject: [PATCH 027/351] block: remove dead elevator code

This removes a bunch of core and elevator related code. On the core
front, we remove anything related to queue running, draining,
initialization, plugging, and congestions. We also kill anything
related to request allocation, merging, retrieval, and completion.

Remove any checking for single queue IO schedulers, as they no
longer exist. This means we can also delete a bunch of code related
to request issue, adding, completion, etc - and all the SQ related
ops and helpers.

Also kill the load_default_modules(), as all that did was provide
for a way to load the default single queue elevator.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c      |    1 -
 block/blk-core.c         | 1709 +-------------------------------------
 block/blk-exec.c         |   20 +-
 block/blk-ioc.c          |   33 +-
 block/blk-merge.c        |    5 -
 block/blk-settings.c     |   36 -
 block/blk-sysfs.c        |   36 +-
 block/blk.h              |   51 --
 block/elevator.c         |  377 +--------
 block/kyber-iosched.c    |    1 -
 block/mq-deadline.c      |    1 -
 include/linux/blkdev.h   |   93 +--
 include/linux/elevator.h |   90 +-
 include/linux/init.h     |    1 -
 init/do_mounts_initrd.c  |    3 -
 init/initramfs.c         |    6 -
 init/main.c              |   12 -
 17 files changed, 55 insertions(+), 2420 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3a27d31fcda6..44c7e567aa25 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5745,7 +5745,6 @@ static struct elevator_type iosched_bfq_mq = {
 		.exit_sched		= bfq_exit_queue,
 	},
 
-	.uses_mq =		true,
 	.icq_size =		sizeof(struct bfq_io_cq),
 	.icq_align =		__alignof__(struct bfq_io_cq),
 	.elevator_attrs =	bfq_attrs,
diff --git a/block/blk-core.c b/block/blk-core.c
index daaed4dfa719..18538a41a532 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -144,46 +144,6 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
 
-static void blk_clear_congested(struct request_list *rl, int sync)
-{
-#ifdef CONFIG_CGROUP_WRITEBACK
-	clear_wb_congested(rl->blkg->wb_congested, sync);
-#else
-	/*
-	 * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
-	 * flip its congestion state for events on other blkcgs.
-	 */
-	if (rl == &rl->q->root_rl)
-		clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
-#endif
-}
-
-static void blk_set_congested(struct request_list *rl, int sync)
-{
-#ifdef CONFIG_CGROUP_WRITEBACK
-	set_wb_congested(rl->blkg->wb_congested, sync);
-#else
-	/* see blk_clear_congested() */
-	if (rl == &rl->q->root_rl)
-		set_wb_congested(rl->q->backing_dev_info->wb.congested, sync);
-#endif
-}
-
-void blk_queue_congestion_threshold(struct request_queue *q)
-{
-	int nr;
-
-	nr = q->nr_requests - (q->nr_requests / 8) + 1;
-	if (nr > q->nr_requests)
-		nr = q->nr_requests;
-	q->nr_congestion_on = nr;
-
-	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
-	if (nr < 1)
-		nr = 1;
-	q->nr_congestion_off = nr;
-}
-
 void blk_rq_init(struct request_queue *q, struct request *rq)
 {
 	memset(rq, 0, sizeof(*rq));
@@ -292,99 +252,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 }
 EXPORT_SYMBOL(blk_dump_rq_flags);
 
-static void blk_delay_work(struct work_struct *work)
-{
-	struct request_queue *q;
-
-	q = container_of(work, struct request_queue, delay_work.work);
-	spin_lock_irq(q->queue_lock);
-	__blk_run_queue(q);
-	spin_unlock_irq(q->queue_lock);
-}
-
-/**
- * blk_delay_queue - restart queueing after defined interval
- * @q:		The &struct request_queue in question
- * @msecs:	Delay in msecs
- *
- * Description:
- *   Sometimes queueing needs to be postponed for a little while, to allow
- *   resources to come back. This function will make sure that queueing is
- *   restarted around the specified time.
- */
-void blk_delay_queue(struct request_queue *q, unsigned long msecs)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	if (likely(!blk_queue_dead(q)))
-		queue_delayed_work(kblockd_workqueue, &q->delay_work,
-				   msecs_to_jiffies(msecs));
-}
-EXPORT_SYMBOL(blk_delay_queue);
-
-/**
- * blk_start_queue_async - asynchronously restart a previously stopped queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   blk_start_queue_async() will clear the stop flag on the queue, and
- *   ensure that the request_fn for the queue is run from an async
- *   context.
- **/
-void blk_start_queue_async(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-	blk_run_queue_async(q);
-}
-EXPORT_SYMBOL(blk_start_queue_async);
-
-/**
- * blk_start_queue - restart a previously stopped queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   blk_start_queue() will clear the stop flag on the queue, and call
- *   the request_fn for the queue if it was in a stopped state when
- *   entered. Also see blk_stop_queue().
- **/
-void blk_start_queue(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-	__blk_run_queue(q);
-}
-EXPORT_SYMBOL(blk_start_queue);
-
-/**
- * blk_stop_queue - stop a queue
- * @q:    The &struct request_queue in question
- *
- * Description:
- *   The Linux block layer assumes that a block driver will consume all
- *   entries on the request queue when the request_fn strategy is called.
- *   Often this will not happen, because of hardware limitations (queue
- *   depth settings). If a device driver gets a 'queue full' response,
- *   or if it simply chooses not to queue more I/O at one point, it can
- *   call this function to prevent the request_fn from being called until
- *   the driver has signalled it's ready to go again. This happens by calling
- *   blk_start_queue() to restart queue operations.
- **/
-void blk_stop_queue(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	cancel_delayed_work(&q->delay_work);
-	queue_flag_set(QUEUE_FLAG_STOPPED, q);
-}
-EXPORT_SYMBOL(blk_stop_queue);
-
 /**
  * blk_sync_queue - cancel any pending callbacks on a queue
  * @q: the queue
@@ -415,8 +282,6 @@ void blk_sync_queue(struct request_queue *q)
 		cancel_delayed_work_sync(&q->requeue_work);
 		queue_for_each_hw_ctx(q, hctx, i)
 			cancel_delayed_work_sync(&hctx->run_work);
-	} else {
-		cancel_delayed_work_sync(&q->delay_work);
 	}
 }
 EXPORT_SYMBOL(blk_sync_queue);
@@ -442,250 +307,12 @@ void blk_clear_pm_only(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_clear_pm_only);
 
-/**
- * __blk_run_queue_uncond - run a queue whether or not it has been stopped
- * @q:	The queue to run
- *
- * Description:
- *    Invoke request handling on a queue if there are any pending requests.
- *    May be used to restart request handling after a request has completed.
- *    This variant runs the queue whether or not the queue has been
- *    stopped. Must be called with the queue lock held and interrupts
- *    disabled. See also @blk_run_queue.
- */
-inline void __blk_run_queue_uncond(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	if (unlikely(blk_queue_dead(q)))
-		return;
-
-	/*
-	 * Some request_fn implementations, e.g. scsi_request_fn(), unlock
-	 * the queue lock internally. As a result multiple threads may be
-	 * running such a request function concurrently. Keep track of the
-	 * number of active request_fn invocations such that blk_drain_queue()
-	 * can wait until all these request_fn calls have finished.
-	 */
-	q->request_fn_active++;
-	q->request_fn(q);
-	q->request_fn_active--;
-}
-EXPORT_SYMBOL_GPL(__blk_run_queue_uncond);
-
-/**
- * __blk_run_queue - run a single device queue
- * @q:	The queue to run
- *
- * Description:
- *    See @blk_run_queue.
- */
-void __blk_run_queue(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	if (unlikely(blk_queue_stopped(q)))
-		return;
-
-	__blk_run_queue_uncond(q);
-}
-EXPORT_SYMBOL(__blk_run_queue);
-
-/**
- * blk_run_queue_async - run a single device queue in workqueue context
- * @q:	The queue to run
- *
- * Description:
- *    Tells kblockd to perform the equivalent of @blk_run_queue on behalf
- *    of us.
- *
- * Note:
- *    Since it is not allowed to run q->delay_work after blk_cleanup_queue()
- *    has canceled q->delay_work, callers must hold the queue lock to avoid
- *    race conditions between blk_cleanup_queue() and blk_run_queue_async().
- */
-void blk_run_queue_async(struct request_queue *q)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q)))
-		mod_delayed_work(kblockd_workqueue, &q->delay_work, 0);
-}
-EXPORT_SYMBOL(blk_run_queue_async);
-
-/**
- * blk_run_queue - run a single device queue
- * @q: The queue to run
- *
- * Description:
- *    Invoke request handling on this queue, if it has pending work to do.
- *    May be used to restart queueing when a request has completed.
- */
-void blk_run_queue(struct request_queue *q)
-{
-	unsigned long flags;
-
-	WARN_ON_ONCE(q->mq_ops);
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_run_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(blk_run_queue);
-
 void blk_put_queue(struct request_queue *q)
 {
 	kobject_put(&q->kobj);
 }
 EXPORT_SYMBOL(blk_put_queue);
 
-/**
- * __blk_drain_queue - drain requests from request_queue
- * @q: queue to drain
- * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV
- *
- * Drain requests from @q.  If @drain_all is set, all requests are drained.
- * If not, only ELVPRIV requests are drained.  The caller is responsible
- * for ensuring that no new requests which need to be drained are queued.
- */
-static void __blk_drain_queue(struct request_queue *q, bool drain_all)
-	__releases(q->queue_lock)
-	__acquires(q->queue_lock)
-{
-	int i;
-
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	while (true) {
-		bool drain = false;
-
-		/*
-		 * The caller might be trying to drain @q before its
-		 * elevator is initialized.
-		 */
-		if (q->elevator)
-			elv_drain_elevator(q);
-
-		blkcg_drain_queue(q);
-
-		/*
-		 * This function might be called on a queue which failed
-		 * driver init after queue creation or is not yet fully
-		 * active yet.  Some drivers (e.g. fd and loop) get unhappy
-		 * in such cases.  Kick queue iff dispatch queue has
-		 * something on it and @q has request_fn set.
-		 */
-		if (!list_empty(&q->queue_head) && q->request_fn)
-			__blk_run_queue(q);
-
-		drain |= q->nr_rqs_elvpriv;
-		drain |= q->request_fn_active;
-
-		/*
-		 * Unfortunately, requests are queued at and tracked from
-		 * multiple places and there's no single counter which can
-		 * be drained.  Check all the queues and counters.
-		 */
-		if (drain_all) {
-			struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
-			drain |= !list_empty(&q->queue_head);
-			for (i = 0; i < 2; i++) {
-				drain |= q->nr_rqs[i];
-				drain |= q->in_flight[i];
-				if (fq)
-				    drain |= !list_empty(&fq->flush_queue[i]);
-			}
-		}
-
-		if (!drain)
-			break;
-
-		spin_unlock_irq(q->queue_lock);
-
-		msleep(10);
-
-		spin_lock_irq(q->queue_lock);
-	}
-
-	/*
-	 * With queue marked dead, any woken up waiter will fail the
-	 * allocation path, so the wakeup chaining is lost and we're
-	 * left with hung waiters. We need to wake up those waiters.
-	 */
-	if (q->request_fn) {
-		struct request_list *rl;
-
-		blk_queue_for_each_rl(rl, q)
-			for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
-				wake_up_all(&rl->wait[i]);
-	}
-}
-
-void blk_drain_queue(struct request_queue *q)
-{
-	spin_lock_irq(q->queue_lock);
-	__blk_drain_queue(q, true);
-	spin_unlock_irq(q->queue_lock);
-}
-
-/**
- * blk_queue_bypass_start - enter queue bypass mode
- * @q: queue of interest
- *
- * In bypass mode, only the dispatch FIFO queue of @q is used.  This
- * function makes @q enter bypass mode and drains all requests which were
- * throttled or issued before.  On return, it's guaranteed that no request
- * is being throttled or has ELVPRIV set and blk_queue_bypass() %true
- * inside queue or RCU read lock.
- */
-void blk_queue_bypass_start(struct request_queue *q)
-{
-	WARN_ON_ONCE(q->mq_ops);
-
-	spin_lock_irq(q->queue_lock);
-	q->bypass_depth++;
-	queue_flag_set(QUEUE_FLAG_BYPASS, q);
-	spin_unlock_irq(q->queue_lock);
-
-	/*
-	 * Queues start drained.  Skip actual draining till init is
-	 * complete.  This avoids lenghty delays during queue init which
-	 * can happen many times during boot.
-	 */
-	if (blk_queue_init_done(q)) {
-		spin_lock_irq(q->queue_lock);
-		__blk_drain_queue(q, false);
-		spin_unlock_irq(q->queue_lock);
-
-		/* ensure blk_queue_bypass() is %true inside RCU read lock */
-		synchronize_rcu();
-	}
-}
-EXPORT_SYMBOL_GPL(blk_queue_bypass_start);
-
-/**
- * blk_queue_bypass_end - leave queue bypass mode
- * @q: queue of interest
- *
- * Leave bypass mode and restore the normal queueing behavior.
- *
- * Note: although blk_queue_bypass_start() is only called for blk-sq queues,
- * this function is called for both blk-sq and blk-mq queues.
- */
-void blk_queue_bypass_end(struct request_queue *q)
-{
-	spin_lock_irq(q->queue_lock);
-	if (!--q->bypass_depth)
-		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
-	WARN_ON_ONCE(q->bypass_depth < 0);
-	spin_unlock_irq(q->queue_lock);
-}
-EXPORT_SYMBOL_GPL(blk_queue_bypass_end);
-
 void blk_set_queue_dying(struct request_queue *q)
 {
 	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
@@ -699,18 +326,6 @@ void blk_set_queue_dying(struct request_queue *q)
 
 	if (q->mq_ops)
 		blk_mq_wake_waiters(q);
-	else {
-		struct request_list *rl;
-
-		spin_lock_irq(q->queue_lock);
-		blk_queue_for_each_rl(rl, q) {
-			if (rl->rq_pool) {
-				wake_up_all(&rl->wait[BLK_RW_SYNC]);
-				wake_up_all(&rl->wait[BLK_RW_ASYNC]);
-			}
-		}
-		spin_unlock_irq(q->queue_lock);
-	}
 
 	/* Make blk_queue_enter() reexamine the DYING flag. */
 	wake_up_all(&q->mq_freeze_wq);
@@ -822,6 +437,7 @@ void blk_cleanup_queue(struct request_queue *q)
 
 	if (q->mq_ops)
 		blk_mq_free_queue(q);
+
 	percpu_ref_exit(&q->q_usage_counter);
 
 	spin_lock_irq(lock);
@@ -1013,8 +629,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 
 	INIT_LIST_HEAD(&q->queue_head);
 	q->last_merge = NULL;
-	q->end_sector = 0;
-	q->boundary_rq = NULL;
 
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
 	if (q->id < 0)
@@ -1047,7 +661,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 #ifdef CONFIG_BLK_CGROUP
 	INIT_LIST_HEAD(&q->blkg_list);
 #endif
-	INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
 
 	kobject_init(&q->kobj, &blk_queue_ktype);
 
@@ -1100,105 +713,6 @@ fail_q:
 }
 EXPORT_SYMBOL(blk_alloc_queue_node);
 
-/**
- * blk_init_queue  - prepare a request queue for use with a block device
- * @rfn:  The function to be called to process requests that have been
- *        placed on the queue.
- * @lock: Request queue spin lock
- *
- * Description:
- *    If a block device wishes to use the standard request handling procedures,
- *    which sorts requests and coalesces adjacent requests, then it must
- *    call blk_init_queue().  The function @rfn will be called when there
- *    are requests on the queue that need to be processed.  If the device
- *    supports plugging, then @rfn may not be called immediately when requests
- *    are available on the queue, but may be called at some time later instead.
- *    Plugged queues are generally unplugged when a buffer belonging to one
- *    of the requests on the queue is needed, or due to memory pressure.
- *
- *    @rfn is not required, or even expected, to remove all requests off the
- *    queue, but only as many as it can handle at a time.  If it does leave
- *    requests on the queue, it is responsible for arranging that the requests
- *    get dealt with eventually.
- *
- *    The queue spin lock must be held while manipulating the requests on the
- *    request queue; this lock will be taken also from interrupt context, so irq
- *    disabling is needed for it.
- *
- *    Function returns a pointer to the initialized request queue, or %NULL if
- *    it didn't succeed.
- *
- * Note:
- *    blk_init_queue() must be paired with a blk_cleanup_queue() call
- *    when the block device is deactivated (such as at module unload).
- **/
-
-struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
-{
-	return blk_init_queue_node(rfn, lock, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(blk_init_queue);
-
-struct request_queue *
-blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
-{
-	struct request_queue *q;
-
-	q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock);
-	if (!q)
-		return NULL;
-
-	q->request_fn = rfn;
-	if (blk_init_allocated_queue(q) < 0) {
-		blk_cleanup_queue(q);
-		return NULL;
-	}
-
-	return q;
-}
-EXPORT_SYMBOL(blk_init_queue_node);
-
-static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio);
-
-
-int blk_init_allocated_queue(struct request_queue *q)
-{
-	WARN_ON_ONCE(q->mq_ops);
-
-	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL);
-	if (!q->fq)
-		return -ENOMEM;
-
-	if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL))
-		goto out_free_flush_queue;
-
-	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
-		goto out_exit_flush_rq;
-
-	INIT_WORK(&q->timeout_work, blk_timeout_work);
-	q->queue_flags		|= QUEUE_FLAG_DEFAULT;
-
-	/*
-	 * This also sets hw/phys segments, boundary and size
-	 */
-	blk_queue_make_request(q, blk_queue_bio);
-
-	q->sg_reserved_size = INT_MAX;
-
-	if (elevator_init(q))
-		goto out_exit_flush_rq;
-	return 0;
-
-out_exit_flush_rq:
-	if (q->exit_rq_fn)
-		q->exit_rq_fn(q, q->fq->flush_rq);
-out_free_flush_queue:
-	blk_free_flush_queue(q->fq);
-	q->fq = NULL;
-	return -ENOMEM;
-}
-EXPORT_SYMBOL(blk_init_allocated_queue);
-
 bool blk_get_queue(struct request_queue *q)
 {
 	if (likely(!blk_queue_dying(q))) {
@@ -1210,406 +724,6 @@ bool blk_get_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_get_queue);
 
-static inline void blk_free_request(struct request_list *rl, struct request *rq)
-{
-	if (rq->rq_flags & RQF_ELVPRIV) {
-		elv_put_request(rl->q, rq);
-		if (rq->elv.icq)
-			put_io_context(rq->elv.icq->ioc);
-	}
-
-	mempool_free(rq, rl->rq_pool);
-}
-
-/*
- * ioc_batching returns true if the ioc is a valid batching request and
- * should be given priority access to a request.
- */
-static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
-{
-	if (!ioc)
-		return 0;
-
-	/*
-	 * Make sure the process is able to allocate at least 1 request
-	 * even if the batch times out, otherwise we could theoretically
-	 * lose wakeups.
-	 */
-	return ioc->nr_batch_requests == q->nr_batching ||
-		(ioc->nr_batch_requests > 0
-		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
-}
-
-/*
- * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
- * will cause the process to be a "batcher" on all queues in the system. This
- * is the behaviour we want though - once it gets a wakeup it should be given
- * a nice run.
- */
-static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
-{
-	if (!ioc || ioc_batching(q, ioc))
-		return;
-
-	ioc->nr_batch_requests = q->nr_batching;
-	ioc->last_waited = jiffies;
-}
-
-static void __freed_request(struct request_list *rl, int sync)
-{
-	struct request_queue *q = rl->q;
-
-	if (rl->count[sync] < queue_congestion_off_threshold(q))
-		blk_clear_congested(rl, sync);
-
-	if (rl->count[sync] + 1 <= q->nr_requests) {
-		if (waitqueue_active(&rl->wait[sync]))
-			wake_up(&rl->wait[sync]);
-
-		blk_clear_rl_full(rl, sync);
-	}
-}
-
-/*
- * A request has just been released.  Account for it, update the full and
- * congestion status, wake up any waiters.   Called under q->queue_lock.
- */
-static void freed_request(struct request_list *rl, bool sync,
-		req_flags_t rq_flags)
-{
-	struct request_queue *q = rl->q;
-
-	q->nr_rqs[sync]--;
-	rl->count[sync]--;
-	if (rq_flags & RQF_ELVPRIV)
-		q->nr_rqs_elvpriv--;
-
-	__freed_request(rl, sync);
-
-	if (unlikely(rl->starved[sync ^ 1]))
-		__freed_request(rl, sync ^ 1);
-}
-
-int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
-{
-	struct request_list *rl;
-	int on_thresh, off_thresh;
-
-	WARN_ON_ONCE(q->mq_ops);
-
-	spin_lock_irq(q->queue_lock);
-	q->nr_requests = nr;
-	blk_queue_congestion_threshold(q);
-	on_thresh = queue_congestion_on_threshold(q);
-	off_thresh = queue_congestion_off_threshold(q);
-
-	blk_queue_for_each_rl(rl, q) {
-		if (rl->count[BLK_RW_SYNC] >= on_thresh)
-			blk_set_congested(rl, BLK_RW_SYNC);
-		else if (rl->count[BLK_RW_SYNC] < off_thresh)
-			blk_clear_congested(rl, BLK_RW_SYNC);
-
-		if (rl->count[BLK_RW_ASYNC] >= on_thresh)
-			blk_set_congested(rl, BLK_RW_ASYNC);
-		else if (rl->count[BLK_RW_ASYNC] < off_thresh)
-			blk_clear_congested(rl, BLK_RW_ASYNC);
-
-		if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_SYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_SYNC);
-			wake_up(&rl->wait[BLK_RW_SYNC]);
-		}
-
-		if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) {
-			blk_set_rl_full(rl, BLK_RW_ASYNC);
-		} else {
-			blk_clear_rl_full(rl, BLK_RW_ASYNC);
-			wake_up(&rl->wait[BLK_RW_ASYNC]);
-		}
-	}
-
-	spin_unlock_irq(q->queue_lock);
-	return 0;
-}
-
-/**
- * __get_request - get a free request
- * @rl: request list to allocate from
- * @op: operation and flags
- * @bio: bio to allocate request for (can be %NULL)
- * @flags: BLQ_MQ_REQ_* flags
- * @gfp_mask: allocator flags
- *
- * Get a free request from @q.  This function may fail under memory
- * pressure or if @q is dead.
- *
- * Must be called with @q->queue_lock held and,
- * Returns ERR_PTR on failure, with @q->queue_lock held.
- * Returns request pointer on success, with @q->queue_lock *not held*.
- */
-static struct request *__get_request(struct request_list *rl, unsigned int op,
-		struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask)
-{
-	struct request_queue *q = rl->q;
-	struct request *rq;
-	struct elevator_type *et = q->elevator->type;
-	struct io_context *ioc = rq_ioc(bio);
-	struct io_cq *icq = NULL;
-	const bool is_sync = op_is_sync(op);
-	int may_queue;
-	req_flags_t rq_flags = RQF_ALLOCED;
-
-	lockdep_assert_held(q->queue_lock);
-
-	if (unlikely(blk_queue_dying(q)))
-		return ERR_PTR(-ENODEV);
-
-	may_queue = elv_may_queue(q, op);
-	if (may_queue == ELV_MQUEUE_NO)
-		goto rq_starved;
-
-	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
-		if (rl->count[is_sync]+1 >= q->nr_requests) {
-			/*
-			 * The queue will fill after this allocation, so set
-			 * it as full, and mark this process as "batching".
-			 * This process will be allowed to complete a batch of
-			 * requests, others will be blocked.
-			 */
-			if (!blk_rl_full(rl, is_sync)) {
-				ioc_set_batching(q, ioc);
-				blk_set_rl_full(rl, is_sync);
-			} else {
-				if (may_queue != ELV_MQUEUE_MUST
-						&& !ioc_batching(q, ioc)) {
-					/*
-					 * The queue is full and the allocating
-					 * process is not a "batcher", and not
-					 * exempted by the IO scheduler
-					 */
-					return ERR_PTR(-ENOMEM);
-				}
-			}
-		}
-		blk_set_congested(rl, is_sync);
-	}
-
-	/*
-	 * Only allow batching queuers to allocate up to 50% over the defined
-	 * limit of requests, otherwise we could have thousands of requests
-	 * allocated with any setting of ->nr_requests
-	 */
-	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
-		return ERR_PTR(-ENOMEM);
-
-	q->nr_rqs[is_sync]++;
-	rl->count[is_sync]++;
-	rl->starved[is_sync] = 0;
-
-	/*
-	 * Decide whether the new request will be managed by elevator.  If
-	 * so, mark @rq_flags and increment elvpriv.  Non-zero elvpriv will
-	 * prevent the current elevator from being destroyed until the new
-	 * request is freed.  This guarantees icq's won't be destroyed and
-	 * makes creating new ones safe.
-	 *
-	 * Flush requests do not use the elevator so skip initialization.
-	 * This allows a request to share the flush and elevator data.
-	 *
-	 * Also, lookup icq while holding queue_lock.  If it doesn't exist,
-	 * it will be created after releasing queue_lock.
-	 */
-	if (!op_is_flush(op) && !blk_queue_bypass(q)) {
-		rq_flags |= RQF_ELVPRIV;
-		q->nr_rqs_elvpriv++;
-		if (et->icq_cache && ioc)
-			icq = ioc_lookup_icq(ioc, q);
-	}
-
-	if (blk_queue_io_stat(q))
-		rq_flags |= RQF_IO_STAT;
-	spin_unlock_irq(q->queue_lock);
-
-	/* allocate and init request */
-	rq = mempool_alloc(rl->rq_pool, gfp_mask);
-	if (!rq)
-		goto fail_alloc;
-
-	blk_rq_init(q, rq);
-	blk_rq_set_rl(rq, rl);
-	rq->cmd_flags = op;
-	rq->rq_flags = rq_flags;
-	if (flags & BLK_MQ_REQ_PREEMPT)
-		rq->rq_flags |= RQF_PREEMPT;
-
-	/* init elvpriv */
-	if (rq_flags & RQF_ELVPRIV) {
-		if (unlikely(et->icq_cache && !icq)) {
-			if (ioc)
-				icq = ioc_create_icq(ioc, q, gfp_mask);
-			if (!icq)
-				goto fail_elvpriv;
-		}
-
-		rq->elv.icq = icq;
-		if (unlikely(elv_set_request(q, rq, bio, gfp_mask)))
-			goto fail_elvpriv;
-
-		/* @rq->elv.icq holds io_context until @rq is freed */
-		if (icq)
-			get_io_context(icq->ioc);
-	}
-out:
-	/*
-	 * ioc may be NULL here, and ioc_batching will be false. That's
-	 * OK, if the queue is under the request limit then requests need
-	 * not count toward the nr_batch_requests limit. There will always
-	 * be some limit enforced by BLK_BATCH_TIME.
-	 */
-	if (ioc_batching(q, ioc))
-		ioc->nr_batch_requests--;
-
-	trace_block_getrq(q, bio, op);
-	return rq;
-
-fail_elvpriv:
-	/*
-	 * elvpriv init failed.  ioc, icq and elvpriv aren't mempool backed
-	 * and may fail indefinitely under memory pressure and thus
-	 * shouldn't stall IO.  Treat this request as !elvpriv.  This will
-	 * disturb iosched and blkcg but weird is bettern than dead.
-	 */
-	printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
-			   __func__, dev_name(q->backing_dev_info->dev));
-
-	rq->rq_flags &= ~RQF_ELVPRIV;
-	rq->elv.icq = NULL;
-
-	spin_lock_irq(q->queue_lock);
-	q->nr_rqs_elvpriv--;
-	spin_unlock_irq(q->queue_lock);
-	goto out;
-
-fail_alloc:
-	/*
-	 * Allocation failed presumably due to memory. Undo anything we
-	 * might have messed up.
-	 *
-	 * Allocating task should really be put onto the front of the wait
-	 * queue, but this is pretty rare.
-	 */
-	spin_lock_irq(q->queue_lock);
-	freed_request(rl, is_sync, rq_flags);
-
-	/*
-	 * in the very unlikely event that allocation failed and no
-	 * requests for this direction was pending, mark us starved so that
-	 * freeing of a request in the other direction will notice
-	 * us. another possible fix would be to split the rq mempool into
-	 * READ and WRITE
-	 */
-rq_starved:
-	if (unlikely(rl->count[is_sync] == 0))
-		rl->starved[is_sync] = 1;
-	return ERR_PTR(-ENOMEM);
-}
-
-/**
- * get_request - get a free request
- * @q: request_queue to allocate request from
- * @op: operation and flags
- * @bio: bio to allocate request for (can be %NULL)
- * @flags: BLK_MQ_REQ_* flags.
- * @gfp: allocator flags
- *
- * Get a free request from @q.  If %BLK_MQ_REQ_NOWAIT is set in @flags,
- * this function keeps retrying under memory pressure and fails iff @q is dead.
- *
- * Must be called with @q->queue_lock held and,
- * Returns ERR_PTR on failure, with @q->queue_lock held.
- * Returns request pointer on success, with @q->queue_lock *not held*.
- */
-static struct request *get_request(struct request_queue *q, unsigned int op,
-		struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp)
-{
-	const bool is_sync = op_is_sync(op);
-	DEFINE_WAIT(wait);
-	struct request_list *rl;
-	struct request *rq;
-
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
-retry:
-	rq = __get_request(rl, op, bio, flags, gfp);
-	if (!IS_ERR(rq))
-		return rq;
-
-	if (op & REQ_NOWAIT) {
-		blk_put_rl(rl);
-		return ERR_PTR(-EAGAIN);
-	}
-
-	if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) {
-		blk_put_rl(rl);
-		return rq;
-	}
-
-	/* wait on @rl and retry */
-	prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
-				  TASK_UNINTERRUPTIBLE);
-
-	trace_block_sleeprq(q, bio, op);
-
-	spin_unlock_irq(q->queue_lock);
-	io_schedule();
-
-	/*
-	 * After sleeping, we become a "batching" process and will be able
-	 * to allocate at least one request, and up to a big batch of them
-	 * for a small period time.  See ioc_batching, ioc_set_batching
-	 */
-	ioc_set_batching(q, current->io_context);
-
-	spin_lock_irq(q->queue_lock);
-	finish_wait(&rl->wait[is_sync], &wait);
-
-	goto retry;
-}
-
-/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */
-static struct request *blk_old_get_request(struct request_queue *q,
-				unsigned int op, blk_mq_req_flags_t flags)
-{
-	struct request *rq;
-	gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO;
-	int ret = 0;
-
-	WARN_ON_ONCE(q->mq_ops);
-
-	/* create ioc upfront */
-	create_io_context(gfp_mask, q->node);
-
-	ret = blk_queue_enter(q, flags);
-	if (ret)
-		return ERR_PTR(ret);
-	spin_lock_irq(q->queue_lock);
-	rq = get_request(q, op, NULL, flags, gfp_mask);
-	if (IS_ERR(rq)) {
-		spin_unlock_irq(q->queue_lock);
-		blk_queue_exit(q);
-		return rq;
-	}
-
-	/* q->queue_lock is unlocked at this point */
-	rq->__data_len = 0;
-	rq->__sector = (sector_t) -1;
-	rq->bio = rq->biotail = NULL;
-	return rq;
-}
-
 /**
  * blk_get_request - allocate a request
  * @q: request queue to allocate a request for
@@ -1624,53 +738,14 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
 	WARN_ON_ONCE(op & REQ_NOWAIT);
 	WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
 
-	if (q->mq_ops) {
-		req = blk_mq_alloc_request(q, op, flags);
-		if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
-			q->mq_ops->initialize_rq_fn(req);
-	} else {
-		req = blk_old_get_request(q, op, flags);
-		if (!IS_ERR(req) && q->initialize_rq_fn)
-			q->initialize_rq_fn(req);
-	}
+	req = blk_mq_alloc_request(q, op, flags);
+	if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
+		q->mq_ops->initialize_rq_fn(req);
 
 	return req;
 }
 EXPORT_SYMBOL(blk_get_request);
 
-/**
- * blk_requeue_request - put a request back on queue
- * @q:		request queue where request should be inserted
- * @rq:		request to be inserted
- *
- * Description:
- *    Drivers often keep queueing requests until the hardware cannot accept
- *    more, when that condition happens we need to put the request back
- *    on the queue. Must be called with queue lock held.
- */
-void blk_requeue_request(struct request_queue *q, struct request *rq)
-{
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	blk_delete_timer(rq);
-	blk_clear_rq_complete(rq);
-	trace_block_rq_requeue(q, rq);
-	rq_qos_requeue(q, rq);
-
-	BUG_ON(blk_queued_rq(rq));
-
-	elv_requeue_request(q, rq);
-}
-EXPORT_SYMBOL(blk_requeue_request);
-
-static void add_acct_request(struct request_queue *q, struct request *rq,
-			     int where)
-{
-	blk_account_io_start(rq, true);
-	__elv_add_request(q, rq, where);
-}
-
 static void part_round_stats_single(struct request_queue *q, int cpu,
 				    struct hd_struct *part, unsigned long now,
 				    unsigned int inflight)
@@ -1730,61 +805,16 @@ EXPORT_SYMBOL_GPL(part_round_stats);
 
 void __blk_put_request(struct request_queue *q, struct request *req)
 {
-	req_flags_t rq_flags = req->rq_flags;
-
 	if (unlikely(!q))
 		return;
 
-	if (q->mq_ops) {
-		blk_mq_free_request(req);
-		return;
-	}
-
-	lockdep_assert_held(q->queue_lock);
-
-	blk_req_zone_write_unlock(req);
-	blk_pm_put_request(req);
-	blk_pm_mark_last_busy(req);
-
-	elv_completed_request(q, req);
-
-	/* this is a bio leak */
-	WARN_ON(req->bio != NULL);
-
-	rq_qos_done(q, req);
-
-	/*
-	 * Request may not have originated from ll_rw_blk. if not,
-	 * it didn't come out of our reserved rq pools
-	 */
-	if (rq_flags & RQF_ALLOCED) {
-		struct request_list *rl = blk_rq_rl(req);
-		bool sync = op_is_sync(req->cmd_flags);
-
-		BUG_ON(!list_empty(&req->queuelist));
-		BUG_ON(ELV_ON_HASH(req));
-
-		blk_free_request(rl, req);
-		freed_request(rl, sync, rq_flags);
-		blk_put_rl(rl);
-		blk_queue_exit(q);
-	}
+	blk_mq_free_request(req);
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
 
 void blk_put_request(struct request *req)
 {
-	struct request_queue *q = req->q;
-
-	if (q->mq_ops)
-		blk_mq_free_request(req);
-	else {
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		__blk_put_request(q, req);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
+	blk_mq_free_request(req);
 }
 EXPORT_SYMBOL(blk_put_request);
 
@@ -1893,10 +923,7 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 		return false;
 	*request_count = 0;
 
-	if (q->mq_ops)
-		plug_list = &plug->mq_list;
-	else
-		plug_list = &plug->list;
+	plug_list = &plug->mq_list;
 
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
 		bool merged = false;
@@ -1947,11 +974,7 @@ unsigned int blk_plug_queued_count(struct request_queue *q)
 	if (!plug)
 		goto out;
 
-	if (q->mq_ops)
-		plug_list = &plug->mq_list;
-	else
-		plug_list = &plug->list;
-
+	plug_list = &plug->mq_list;
 	list_for_each_entry(rq, plug_list, queuelist) {
 		if (rq->q == q)
 			ret++;
@@ -1979,133 +1002,6 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
 
-static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
-{
-	struct blk_plug *plug;
-	int where = ELEVATOR_INSERT_SORT;
-	struct request *req, *free;
-	unsigned int request_count = 0;
-
-	/*
-	 * low level driver can indicate that it wants pages above a
-	 * certain limit bounced to low memory (ie for highmem, or even
-	 * ISA dma in theory)
-	 */
-	blk_queue_bounce(q, &bio);
-
-	blk_queue_split(q, &bio);
-
-	if (!bio_integrity_prep(bio))
-		return BLK_QC_T_NONE;
-
-	if (op_is_flush(bio->bi_opf)) {
-		spin_lock_irq(q->queue_lock);
-		where = ELEVATOR_INSERT_FLUSH;
-		goto get_rq;
-	}
-
-	/*
-	 * Check if we can merge with the plugged list before grabbing
-	 * any locks.
-	 */
-	if (!blk_queue_nomerges(q)) {
-		if (blk_attempt_plug_merge(q, bio, &request_count, NULL))
-			return BLK_QC_T_NONE;
-	} else
-		request_count = blk_plug_queued_count(q);
-
-	spin_lock_irq(q->queue_lock);
-
-	switch (elv_merge(q, &req, bio)) {
-	case ELEVATOR_BACK_MERGE:
-		if (!bio_attempt_back_merge(q, req, bio))
-			break;
-		elv_bio_merged(q, req, bio);
-		free = attempt_back_merge(q, req);
-		if (free)
-			__blk_put_request(q, free);
-		else
-			elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
-		goto out_unlock;
-	case ELEVATOR_FRONT_MERGE:
-		if (!bio_attempt_front_merge(q, req, bio))
-			break;
-		elv_bio_merged(q, req, bio);
-		free = attempt_front_merge(q, req);
-		if (free)
-			__blk_put_request(q, free);
-		else
-			elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
-		goto out_unlock;
-	default:
-		break;
-	}
-
-get_rq:
-	rq_qos_throttle(q, bio, q->queue_lock);
-
-	/*
-	 * Grab a free request. This is might sleep but can not fail.
-	 * Returns with the queue unlocked.
-	 */
-	blk_queue_enter_live(q);
-	req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO);
-	if (IS_ERR(req)) {
-		blk_queue_exit(q);
-		rq_qos_cleanup(q, bio);
-		if (PTR_ERR(req) == -ENOMEM)
-			bio->bi_status = BLK_STS_RESOURCE;
-		else
-			bio->bi_status = BLK_STS_IOERR;
-		bio_endio(bio);
-		goto out_unlock;
-	}
-
-	rq_qos_track(q, req, bio);
-
-	/*
-	 * After dropping the lock and possibly sleeping here, our request
-	 * may now be mergeable after it had proven unmergeable (above).
-	 * We don't worry about that case for efficiency. It won't happen
-	 * often, and the elevators are able to handle it.
-	 */
-	blk_init_request_from_bio(req, bio);
-
-	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
-		req->cpu = raw_smp_processor_id();
-
-	plug = current->plug;
-	if (plug) {
-		/*
-		 * If this is the first request added after a plug, fire
-		 * of a plug trace.
-		 *
-		 * @request_count may become stale because of schedule
-		 * out, so check plug list again.
-		 */
-		if (!request_count || list_empty(&plug->list))
-			trace_block_plug(q);
-		else {
-			struct request *last = list_entry_rq(plug->list.prev);
-			if (request_count >= BLK_MAX_REQUEST_COUNT ||
-			    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) {
-				blk_flush_plug_list(plug, false);
-				trace_block_plug(q);
-			}
-		}
-		list_add_tail(&req->queuelist, &plug->list);
-		blk_account_io_start(req, true);
-	} else {
-		spin_lock_irq(q->queue_lock);
-		add_acct_request(q, req, where);
-		__blk_run_queue(q);
-out_unlock:
-		spin_unlock_irq(q->queue_lock);
-	}
-
-	return BLK_QC_T_NONE;
-}
-
 static void handle_bad_sector(struct bio *bio, sector_t maxsector)
 {
 	char b[BDEVNAME_SIZE];
@@ -2617,9 +1513,6 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
  */
 blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
-	unsigned long flags;
-	int where = ELEVATOR_INSERT_BACK;
-
 	if (blk_cloned_rq_check_limits(q, rq))
 		return BLK_STS_IOERR;
 
@@ -2627,38 +1520,15 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 	    should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
 		return BLK_STS_IOERR;
 
-	if (q->mq_ops) {
-		if (blk_queue_io_stat(q))
-			blk_account_io_start(rq, true);
-		/*
-		 * Since we have a scheduler attached on the top device,
-		 * bypass a potential scheduler on the bottom device for
-		 * insert.
-		 */
-		return blk_mq_request_issue_directly(rq);
-	}
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	if (unlikely(blk_queue_dying(q))) {
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		return BLK_STS_IOERR;
-	}
+	if (blk_queue_io_stat(q))
+		blk_account_io_start(rq, true);
 
 	/*
-	 * Submitting request must be dequeued before calling this function
-	 * because it will be linked to another request_queue
+	 * Since we have a scheduler attached on the top device,
+	 * bypass a potential scheduler on the bottom device for
+	 * insert.
 	 */
-	BUG_ON(blk_queued_rq(rq));
-
-	if (op_is_flush(rq->cmd_flags))
-		where = ELEVATOR_INSERT_FLUSH;
-
-	add_acct_request(q, rq, where);
-	if (where == ELEVATOR_INSERT_FLUSH)
-		__blk_run_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return BLK_STS_OK;
+	return blk_mq_request_issue_directly(rq);
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
@@ -2778,225 +1648,6 @@ void blk_account_io_start(struct request *rq, bool new_io)
 	part_stat_unlock();
 }
 
-static struct request *elv_next_request(struct request_queue *q)
-{
-	struct request *rq;
-	struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL);
-
-	WARN_ON_ONCE(q->mq_ops);
-
-	while (1) {
-		list_for_each_entry(rq, &q->queue_head, queuelist) {
-#ifdef CONFIG_PM
-			/*
-			 * If a request gets queued in state RPM_SUSPENDED
-			 * then that's a kernel bug.
-			 */
-			WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED);
-#endif
-			return rq;
-		}
-
-		/*
-		 * Flush request is running and flush request isn't queueable
-		 * in the drive, we can hold the queue till flush request is
-		 * finished. Even we don't do this, driver can't dispatch next
-		 * requests and will requeue them. And this can improve
-		 * throughput too. For example, we have request flush1, write1,
-		 * flush 2. flush1 is dispatched, then queue is hold, write1
-		 * isn't inserted to queue. After flush1 is finished, flush2
-		 * will be dispatched. Since disk cache is already clean,
-		 * flush2 will be finished very soon, so looks like flush2 is
-		 * folded to flush1.
-		 * Since the queue is hold, a flag is set to indicate the queue
-		 * should be restarted later. Please see flush_end_io() for
-		 * details.
-		 */
-		if (fq->flush_pending_idx != fq->flush_running_idx &&
-				!queue_flush_queueable(q)) {
-			fq->flush_queue_delayed = 1;
-			return NULL;
-		}
-		if (unlikely(blk_queue_bypass(q)) ||
-		    !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
-			return NULL;
-	}
-}
-
-/**
- * blk_peek_request - peek at the top of a request queue
- * @q: request queue to peek at
- *
- * Description:
- *     Return the request at the top of @q.  The returned request
- *     should be started using blk_start_request() before LLD starts
- *     processing it.
- *
- * Return:
- *     Pointer to the request at the top of @q if available.  Null
- *     otherwise.
- */
-struct request *blk_peek_request(struct request_queue *q)
-{
-	struct request *rq;
-	int ret;
-
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	while ((rq = elv_next_request(q)) != NULL) {
-		if (!(rq->rq_flags & RQF_STARTED)) {
-			/*
-			 * This is the first time the device driver
-			 * sees this request (possibly after
-			 * requeueing).  Notify IO scheduler.
-			 */
-			if (rq->rq_flags & RQF_SORTED)
-				elv_activate_rq(q, rq);
-
-			/*
-			 * just mark as started even if we don't start
-			 * it, a request that has been delayed should
-			 * not be passed by new incoming requests
-			 */
-			rq->rq_flags |= RQF_STARTED;
-			trace_block_rq_issue(q, rq);
-		}
-
-		if (!q->boundary_rq || q->boundary_rq == rq) {
-			q->end_sector = rq_end_sector(rq);
-			q->boundary_rq = NULL;
-		}
-
-		if (rq->rq_flags & RQF_DONTPREP)
-			break;
-
-		if (q->dma_drain_size && blk_rq_bytes(rq)) {
-			/*
-			 * make sure space for the drain appears we
-			 * know we can do this because max_hw_segments
-			 * has been adjusted to be one fewer than the
-			 * device can handle
-			 */
-			rq->nr_phys_segments++;
-		}
-
-		if (!q->prep_rq_fn)
-			break;
-
-		ret = q->prep_rq_fn(q, rq);
-		if (ret == BLKPREP_OK) {
-			break;
-		} else if (ret == BLKPREP_DEFER) {
-			/*
-			 * the request may have been (partially) prepped.
-			 * we need to keep this request in the front to
-			 * avoid resource deadlock.  RQF_STARTED will
-			 * prevent other fs requests from passing this one.
-			 */
-			if (q->dma_drain_size && blk_rq_bytes(rq) &&
-			    !(rq->rq_flags & RQF_DONTPREP)) {
-				/*
-				 * remove the space for the drain we added
-				 * so that we don't add it again
-				 */
-				--rq->nr_phys_segments;
-			}
-
-			rq = NULL;
-			break;
-		} else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) {
-			rq->rq_flags |= RQF_QUIET;
-			/*
-			 * Mark this request as started so we don't trigger
-			 * any debug logic in the end I/O path.
-			 */
-			blk_start_request(rq);
-			__blk_end_request_all(rq, ret == BLKPREP_INVALID ?
-					BLK_STS_TARGET : BLK_STS_IOERR);
-		} else {
-			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
-			break;
-		}
-	}
-
-	return rq;
-}
-EXPORT_SYMBOL(blk_peek_request);
-
-static void blk_dequeue_request(struct request *rq)
-{
-	struct request_queue *q = rq->q;
-
-	BUG_ON(list_empty(&rq->queuelist));
-	BUG_ON(ELV_ON_HASH(rq));
-
-	list_del_init(&rq->queuelist);
-
-	/*
-	 * the time frame between a request being removed from the lists
-	 * and to it is freed is accounted as io that is in progress at
-	 * the driver side.
-	 */
-	if (blk_account_rq(rq))
-		q->in_flight[rq_is_sync(rq)]++;
-}
-
-/**
- * blk_start_request - start request processing on the driver
- * @req: request to dequeue
- *
- * Description:
- *     Dequeue @req and start timeout timer on it.  This hands off the
- *     request to the driver.
- */
-void blk_start_request(struct request *req)
-{
-	lockdep_assert_held(req->q->queue_lock);
-	WARN_ON_ONCE(req->q->mq_ops);
-
-	blk_dequeue_request(req);
-
-	if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) {
-		req->io_start_time_ns = ktime_get_ns();
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-		req->throtl_size = blk_rq_sectors(req);
-#endif
-		req->rq_flags |= RQF_STATS;
-		rq_qos_issue(req->q, req);
-	}
-
-	BUG_ON(blk_rq_is_complete(req));
-	blk_add_timer(req);
-}
-EXPORT_SYMBOL(blk_start_request);
-
-/**
- * blk_fetch_request - fetch a request from a request queue
- * @q: request queue to fetch a request from
- *
- * Description:
- *     Return the request at the top of @q.  The request is started on
- *     return and LLD can start processing it immediately.
- *
- * Return:
- *     Pointer to the request at the top of @q if available.  Null
- *     otherwise.
- */
-struct request *blk_fetch_request(struct request_queue *q)
-{
-	struct request *rq;
-
-	lockdep_assert_held(q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	rq = blk_peek_request(q);
-	if (rq)
-		blk_start_request(rq);
-	return rq;
-}
-EXPORT_SYMBOL(blk_fetch_request);
-
 /*
  * Steal bios from a request and add them to a bio list.
  * The request must not have been partially completed before.
@@ -3122,252 +1773,6 @@ bool blk_update_request(struct request *req, blk_status_t error,
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
 
-static bool blk_update_bidi_request(struct request *rq, blk_status_t error,
-				    unsigned int nr_bytes,
-				    unsigned int bidi_bytes)
-{
-	if (blk_update_request(rq, error, nr_bytes))
-		return true;
-
-	/* Bidi request must be completed as a whole */
-	if (unlikely(blk_bidi_rq(rq)) &&
-	    blk_update_request(rq->next_rq, error, bidi_bytes))
-		return true;
-
-	if (blk_queue_add_random(rq->q))
-		add_disk_randomness(rq->rq_disk);
-
-	return false;
-}
-
-/**
- * blk_unprep_request - unprepare a request
- * @req:	the request
- *
- * This function makes a request ready for complete resubmission (or
- * completion).  It happens only after all error handling is complete,
- * so represents the appropriate moment to deallocate any resources
- * that were allocated to the request in the prep_rq_fn.  The queue
- * lock is held when calling this.
- */
-void blk_unprep_request(struct request *req)
-{
-	struct request_queue *q = req->q;
-
-	req->rq_flags &= ~RQF_DONTPREP;
-	if (q->unprep_rq_fn)
-		q->unprep_rq_fn(q, req);
-}
-EXPORT_SYMBOL_GPL(blk_unprep_request);
-
-void blk_finish_request(struct request *req, blk_status_t error)
-{
-	struct request_queue *q = req->q;
-	u64 now = ktime_get_ns();
-
-	lockdep_assert_held(req->q->queue_lock);
-	WARN_ON_ONCE(q->mq_ops);
-
-	if (req->rq_flags & RQF_STATS)
-		blk_stat_add(req, now);
-
-	BUG_ON(blk_queued_rq(req));
-
-	if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req))
-		laptop_io_completion(req->q->backing_dev_info);
-
-	blk_delete_timer(req);
-
-	if (req->rq_flags & RQF_DONTPREP)
-		blk_unprep_request(req);
-
-	blk_account_io_done(req, now);
-
-	if (req->end_io) {
-		rq_qos_done(q, req);
-		req->end_io(req, error);
-	} else {
-		if (blk_bidi_rq(req))
-			__blk_put_request(req->next_rq->q, req->next_rq);
-
-		__blk_put_request(q, req);
-	}
-}
-EXPORT_SYMBOL(blk_finish_request);
-
-/**
- * blk_end_bidi_request - Complete a bidi request
- * @rq:         the request to complete
- * @error:      block status code
- * @nr_bytes:   number of bytes to complete @rq
- * @bidi_bytes: number of bytes to complete @rq->next_rq
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
- *     Drivers that supports bidi can safely call this member for any
- *     type of request, bidi or uni.  In the later case @bidi_bytes is
- *     just ignored.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static bool blk_end_bidi_request(struct request *rq, blk_status_t error,
-				 unsigned int nr_bytes, unsigned int bidi_bytes)
-{
-	struct request_queue *q = rq->q;
-	unsigned long flags;
-
-	WARN_ON_ONCE(q->mq_ops);
-
-	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
-		return true;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_finish_request(rq, error);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return false;
-}
-
-/**
- * __blk_end_bidi_request - Complete a bidi request with queue lock held
- * @rq:         the request to complete
- * @error:      block status code
- * @nr_bytes:   number of bytes to complete @rq
- * @bidi_bytes: number of bytes to complete @rq->next_rq
- *
- * Description:
- *     Identical to blk_end_bidi_request() except that queue lock is
- *     assumed to be locked on entry and remains so on return.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static bool __blk_end_bidi_request(struct request *rq, blk_status_t error,
-				   unsigned int nr_bytes, unsigned int bidi_bytes)
-{
-	lockdep_assert_held(rq->q->queue_lock);
-	WARN_ON_ONCE(rq->q->mq_ops);
-
-	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
-		return true;
-
-	blk_finish_request(rq, error);
-
-	return false;
-}
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    block status code
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-bool blk_end_request(struct request *rq, blk_status_t error,
-		unsigned int nr_bytes)
-{
-	WARN_ON_ONCE(rq->q->mq_ops);
-	return blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-EXPORT_SYMBOL(blk_end_request);
-
-/**
- * blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @error: block status code
- *
- * Description:
- *     Completely finish @rq.
- */
-void blk_end_request_all(struct request *rq, blk_status_t error)
-{
-	bool pending;
-	unsigned int bidi_bytes = 0;
-
-	if (unlikely(blk_bidi_rq(rq)))
-		bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-	BUG_ON(pending);
-}
-EXPORT_SYMBOL(blk_end_request_all);
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    block status code
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-bool __blk_end_request(struct request *rq, blk_status_t error,
-		unsigned int nr_bytes)
-{
-	lockdep_assert_held(rq->q->queue_lock);
-	WARN_ON_ONCE(rq->q->mq_ops);
-
-	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-EXPORT_SYMBOL(__blk_end_request);
-
-/**
- * __blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @error:    block status code
- *
- * Description:
- *     Completely finish @rq.  Must be called with queue lock held.
- */
-void __blk_end_request_all(struct request *rq, blk_status_t error)
-{
-	bool pending;
-	unsigned int bidi_bytes = 0;
-
-	lockdep_assert_held(rq->q->queue_lock);
-	WARN_ON_ONCE(rq->q->mq_ops);
-
-	if (unlikely(blk_bidi_rq(rq)))
-		bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-	BUG_ON(pending);
-}
-EXPORT_SYMBOL(__blk_end_request_all);
-
-/**
- * __blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @error:    block status code
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.  Must
- *     be called with queue lock held.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-bool __blk_end_request_cur(struct request *rq, blk_status_t error)
-{
-	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-EXPORT_SYMBOL(__blk_end_request_cur);
-
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
@@ -3567,7 +1972,6 @@ void blk_start_plug(struct blk_plug *plug)
 	if (tsk->plug)
 		return;
 
-	INIT_LIST_HEAD(&plug->list);
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
 	/*
@@ -3578,36 +1982,6 @@ void blk_start_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_start_plug);
 
-static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
-	struct request *rqa = container_of(a, struct request, queuelist);
-	struct request *rqb = container_of(b, struct request, queuelist);
-
-	return !(rqa->q < rqb->q ||
-		(rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb)));
-}
-
-/*
- * If 'from_schedule' is true, then postpone the dispatch of requests
- * until a safe kblockd context. We due this to avoid accidental big
- * additional stack usage in driver dispatch, in places where the originally
- * plugger did not intend it.
- */
-static void queue_unplugged(struct request_queue *q, unsigned int depth,
-			    bool from_schedule)
-	__releases(q->queue_lock)
-{
-	lockdep_assert_held(q->queue_lock);
-
-	trace_block_unplug(q, depth, !from_schedule);
-
-	if (from_schedule)
-		blk_run_queue_async(q);
-	else
-		__blk_run_queue(q);
-	spin_unlock_irq(q->queue_lock);
-}
-
 static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
 {
 	LIST_HEAD(callbacks);
@@ -3652,65 +2026,10 @@ EXPORT_SYMBOL(blk_check_plugged);
 
 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
-	struct request_queue *q;
-	struct request *rq;
-	LIST_HEAD(list);
-	unsigned int depth;
-
 	flush_plug_callbacks(plug, from_schedule);
 
 	if (!list_empty(&plug->mq_list))
 		blk_mq_flush_plug_list(plug, from_schedule);
-
-	if (list_empty(&plug->list))
-		return;
-
-	list_splice_init(&plug->list, &list);
-
-	list_sort(NULL, &list, plug_rq_cmp);
-
-	q = NULL;
-	depth = 0;
-
-	while (!list_empty(&list)) {
-		rq = list_entry_rq(list.next);
-		list_del_init(&rq->queuelist);
-		BUG_ON(!rq->q);
-		if (rq->q != q) {
-			/*
-			 * This drops the queue lock
-			 */
-			if (q)
-				queue_unplugged(q, depth, from_schedule);
-			q = rq->q;
-			depth = 0;
-			spin_lock_irq(q->queue_lock);
-		}
-
-		/*
-		 * Short-circuit if @q is dead
-		 */
-		if (unlikely(blk_queue_dying(q))) {
-			__blk_end_request_all(rq, BLK_STS_IOERR);
-			continue;
-		}
-
-		/*
-		 * rq is already accounted, so use raw insert
-		 */
-		if (op_is_flush(rq->cmd_flags))
-			__elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
-		else
-			__elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
-
-		depth++;
-	}
-
-	/*
-	 * This drops the queue lock
-	 */
-	if (q)
-		queue_unplugged(q, depth, from_schedule);
 }
 
 void blk_finish_plug(struct blk_plug *plug)
diff --git a/block/blk-exec.c b/block/blk-exec.c
index f7b292f12449..a34b7d918742 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -48,8 +48,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 			   struct request *rq, int at_head,
 			   rq_end_io_fn *done)
 {
-	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
-
 	WARN_ON(irqs_disabled());
 	WARN_ON(!blk_rq_is_passthrough(rq));
 
@@ -60,23 +58,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * don't check dying flag for MQ because the request won't
 	 * be reused after dying flag is set
 	 */
-	if (q->mq_ops) {
-		blk_mq_sched_insert_request(rq, at_head, true, false);
-		return;
-	}
-
-	spin_lock_irq(q->queue_lock);
-
-	if (unlikely(blk_queue_dying(q))) {
-		rq->rq_flags |= RQF_QUIET;
-		__blk_end_request_all(rq, BLK_STS_IOERR);
-		spin_unlock_irq(q->queue_lock);
-		return;
-	}
-
-	__elv_add_request(q, rq, where);
-	__blk_run_queue(q);
-	spin_unlock_irq(q->queue_lock);
+	blk_mq_sched_insert_request(rq, at_head, true, false);
 }
 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
 
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 01580f88fcb3..391128456aec 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -48,10 +48,8 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (icq->flags & ICQ_EXITED)
 		return;
 
-	if (et->uses_mq && et->ops.mq.exit_icq)
+	if (et->ops.mq.exit_icq)
 		et->ops.mq.exit_icq(icq);
-	else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
-		et->ops.sq.elevator_exit_icq_fn(icq);
 
 	icq->flags |= ICQ_EXITED;
 }
@@ -187,25 +185,13 @@ void put_io_context_active(struct io_context *ioc)
 	 * reverse double locking.  Read comment in ioc_release_fn() for
 	 * explanation on the nested locking annotation.
 	 */
-retry:
 	spin_lock_irqsave_nested(&ioc->lock, flags, 1);
 	hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
 		if (icq->flags & ICQ_EXITED)
 			continue;
 
 		et = icq->q->elevator->type;
-		if (et->uses_mq) {
-			ioc_exit_icq(icq);
-		} else {
-			if (spin_trylock(icq->q->queue_lock)) {
-				ioc_exit_icq(icq);
-				spin_unlock(icq->q->queue_lock);
-			} else {
-				spin_unlock_irqrestore(&ioc->lock, flags);
-				cpu_relax();
-				goto retry;
-			}
-		}
+		ioc_exit_icq(icq);
 	}
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
@@ -232,7 +218,7 @@ static void __ioc_clear_queue(struct list_head *icq_list)
 
 	while (!list_empty(icq_list)) {
 		struct io_cq *icq = list_entry(icq_list->next,
-					       struct io_cq, q_node);
+						struct io_cq, q_node);
 		struct io_context *ioc = icq->ioc;
 
 		spin_lock_irqsave(&ioc->lock, flags);
@@ -253,14 +239,9 @@ void ioc_clear_queue(struct request_queue *q)
 
 	spin_lock_irq(q->queue_lock);
 	list_splice_init(&q->icq_list, &icq_list);
+	spin_unlock_irq(q->queue_lock);
 
-	if (q->mq_ops) {
-		spin_unlock_irq(q->queue_lock);
-		__ioc_clear_queue(&icq_list);
-	} else {
-		__ioc_clear_queue(&icq_list);
-		spin_unlock_irq(q->queue_lock);
-	}
+	__ioc_clear_queue(&icq_list);
 }
 
 int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
@@ -415,10 +396,8 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		if (et->uses_mq && et->ops.mq.init_icq)
+		if (et->ops.mq.init_icq)
 			et->ops.mq.init_icq(icq);
-		else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
-			et->ops.sq.elevator_init_icq_fn(icq);
 	} else {
 		kmem_cache_free(et->icq_cache, icq);
 		icq = ioc_lookup_icq(ioc, q);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 6b5ad275ed56..c068c30b0c35 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -862,13 +862,8 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq)
 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 			  struct request *next)
 {
-	struct elevator_queue *e = q->elevator;
 	struct request *free;
 
-	if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
-		if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
-			return 0;
-
 	free = attempt_merge(q, rq, next);
 	if (free) {
 		__blk_put_request(q, free);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index ac8b8ba4b126..39c3c301a687 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -20,40 +20,6 @@ EXPORT_SYMBOL(blk_max_low_pfn);
 
 unsigned long blk_max_pfn;
 
-/**
- * blk_queue_prep_rq - set a prepare_request function for queue
- * @q:		queue
- * @pfn:	prepare_request function
- *
- * It's possible for a queue to register a prepare_request callback which
- * is invoked before the request is handed to the request_fn. The goal of
- * the function is to prepare a request for I/O, it can be used to build a
- * cdb from the request data for instance.
- *
- */
-void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
-{
-	q->prep_rq_fn = pfn;
-}
-EXPORT_SYMBOL(blk_queue_prep_rq);
-
-/**
- * blk_queue_unprep_rq - set an unprepare_request function for queue
- * @q:		queue
- * @ufn:	unprepare_request function
- *
- * It's possible for a queue to register an unprepare_request callback
- * which is invoked before the request is finally completed. The goal
- * of the function is to deallocate any data that was allocated in the
- * prepare_request callback.
- *
- */
-void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
-{
-	q->unprep_rq_fn = ufn;
-}
-EXPORT_SYMBOL(blk_queue_unprep_rq);
-
 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
 {
 	q->softirq_done_fn = fn;
@@ -163,8 +129,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 
 	q->make_request_fn = mfn;
 	blk_queue_dma_alignment(q, 511);
-	blk_queue_congestion_threshold(q);
-	q->nr_batching = BLK_BATCH_REQ;
 
 	blk_set_default_limits(&q->limits);
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1b82ccfde3fe..d4b1b84ba8ca 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	unsigned long nr;
 	int ret, err;
 
-	if (!q->request_fn && !q->mq_ops)
+	if (!q->mq_ops)
 		return -EINVAL;
 
 	ret = queue_var_store(&nr, page, count);
@@ -78,11 +78,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	if (nr < BLKDEV_MIN_RQ)
 		nr = BLKDEV_MIN_RQ;
 
-	if (q->request_fn)
-		err = blk_update_nr_requests(q, nr);
-	else
-		err = blk_mq_update_nr_requests(q, nr);
-
+	err = blk_mq_update_nr_requests(q, nr);
 	if (err)
 		return err;
 
@@ -463,20 +459,14 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
 	 * ends up either enabling or disabling wbt completely. We can't
 	 * have IO inflight if that happens.
 	 */
-	if (q->mq_ops) {
-		blk_mq_freeze_queue(q);
-		blk_mq_quiesce_queue(q);
-	} else
-		blk_queue_bypass_start(q);
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
 
 	wbt_set_min_lat(q, val);
 	wbt_update_limits(q);
 
-	if (q->mq_ops) {
-		blk_mq_unquiesce_queue(q);
-		blk_mq_unfreeze_queue(q);
-	} else
-		blk_queue_bypass_end(q);
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
 
 	return count;
 }
@@ -847,17 +837,10 @@ static void __blk_release_queue(struct work_struct *work)
 
 	blk_free_queue_stats(q->stats);
 
-	blk_exit_rl(q, &q->root_rl);
-
 	blk_queue_free_zone_bitmaps(q);
 
-	if (!q->mq_ops) {
-		if (q->exit_rq_fn)
-			q->exit_rq_fn(q, q->fq->flush_rq);
-		blk_free_flush_queue(q->fq);
-	} else {
+	if (q->mq_ops)
 		blk_mq_release(q);
-	}
 
 	blk_trace_shutdown(q);
 
@@ -920,7 +903,6 @@ int blk_register_queue(struct gendisk *disk)
 	if (!blk_queue_init_done(q)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
 		percpu_ref_switch_to_percpu(&q->q_usage_counter);
-		blk_queue_bypass_end(q);
 	}
 
 	ret = blk_trace_init_sysfs(dev);
@@ -947,7 +929,7 @@ int blk_register_queue(struct gendisk *disk)
 
 	blk_throtl_register_queue(q);
 
-	if (q->request_fn || (q->mq_ops && q->elevator)) {
+	if ((q->mq_ops && q->elevator)) {
 		ret = elv_register_queue(q);
 		if (ret) {
 			mutex_unlock(&q->sysfs_lock);
@@ -1005,7 +987,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	blk_trace_remove_sysfs(disk_to_dev(disk));
 
 	mutex_lock(&q->sysfs_lock);
-	if (q->request_fn || (q->mq_ops && q->elevator))
+	if (q->mq_ops && q->elevator)
 		elv_unregister_queue(q);
 	mutex_unlock(&q->sysfs_lock);
 
diff --git a/block/blk.h b/block/blk.h
index 57a302bf5a70..e2604ae7ddfa 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -7,12 +7,6 @@
 #include <xen/xen.h>
 #include "blk-mq.h"
 
-/* Amount of time in which a process may batch requests */
-#define BLK_BATCH_TIME	(HZ/50UL)
-
-/* Number of requests a "batching" process may submit */
-#define BLK_BATCH_REQ	32
-
 /* Max future timer expiry for timeouts */
 #define BLK_MAX_TIMEOUT		(5 * HZ)
 
@@ -132,9 +126,6 @@ void blk_exit_rl(struct request_queue *q, struct request_list *rl);
 void blk_exit_queue(struct request_queue *q);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
-void blk_queue_bypass_start(struct request_queue *q);
-void blk_queue_bypass_end(struct request_queue *q);
-void __blk_queue_free_tags(struct request_queue *q);
 void blk_freeze_queue(struct request_queue *q);
 
 static inline void blk_queue_enter_live(struct request_queue *q)
@@ -281,23 +272,6 @@ static inline bool blk_rq_is_complete(struct request *rq)
 
 void blk_insert_flush(struct request *rq);
 
-static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e->type->ops.sq.elevator_activate_req_fn)
-		e->type->ops.sq.elevator_activate_req_fn(q, rq);
-}
-
-static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e->type->ops.sq.elevator_deactivate_req_fn)
-		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
-}
-
-int elevator_init(struct request_queue *);
 int elevator_init_mq(struct request_queue *q);
 int elevator_switch_mq(struct request_queue *q,
 			      struct elevator_type *new_e);
@@ -332,31 +306,8 @@ void blk_rq_set_mixed_merge(struct request *rq);
 bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
 enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
 
-void blk_queue_congestion_threshold(struct request_queue *q);
-
 int blk_dev_init(void);
 
-
-/*
- * Return the threshold (number of used requests) at which the queue is
- * considered to be congested.  It include a little hysteresis to keep the
- * context switch rate down.
- */
-static inline int queue_congestion_on_threshold(struct request_queue *q)
-{
-	return q->nr_congestion_on;
-}
-
-/*
- * The threshold at which a queue is considered to be uncongested
- */
-static inline int queue_congestion_off_threshold(struct request_queue *q)
-{
-	return q->nr_congestion_off;
-}
-
-extern int blk_update_nr_requests(struct request_queue *, unsigned int);
-
 /*
  * Contribute to IO statistics IFF:
  *
@@ -478,8 +429,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 }
 #endif /* CONFIG_BOUNCE */
 
-extern void blk_drain_queue(struct request_queue *q);
-
 #ifdef CONFIG_BLK_CGROUP_IOLATENCY
 extern int blk_iolatency_init(struct request_queue *q);
 #else
diff --git a/block/elevator.c b/block/elevator.c
index 54e1adac26c5..334097c54b08 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,10 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->uses_mq && e->type->ops.mq.allow_merge)
+	if (e->type->ops.mq.allow_merge)
 		return e->type->ops.mq.allow_merge(q, rq, bio);
-	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
-		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
 }
@@ -95,14 +93,14 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
 }
 
 /*
- * Return scheduler with name 'name' and with matching 'mq capability
+ * Return scheduler with name 'name'
  */
-static struct elevator_type *elevator_find(const char *name, bool mq)
+static struct elevator_type *elevator_find(const char *name)
 {
 	struct elevator_type *e;
 
 	list_for_each_entry(e, &elv_list, list) {
-		if (elevator_match(e, name) && (mq == e->uses_mq))
+		if (elevator_match(e, name))
 			return e;
 	}
 
@@ -121,12 +119,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
 
 	spin_lock(&elv_list_lock);
 
-	e = elevator_find(name, q->mq_ops != NULL);
+	e = elevator_find(name);
 	if (!e && try_loading) {
 		spin_unlock(&elv_list_lock);
 		request_module("%s-iosched", name);
 		spin_lock(&elv_list_lock);
-		e = elevator_find(name, q->mq_ops != NULL);
+		e = elevator_find(name);
 	}
 
 	if (e && !try_module_get(e->elevator_owner))
@@ -150,26 +148,6 @@ static int __init elevator_setup(char *str)
 
 __setup("elevator=", elevator_setup);
 
-/* called during boot to load the elevator chosen by the elevator param */
-void __init load_default_elevator_module(void)
-{
-	struct elevator_type *e;
-
-	if (!chosen_elevator[0])
-		return;
-
-	/*
-	 * Boot parameter is deprecated, we haven't supported that for MQ.
-	 * Only look for non-mq schedulers from here.
-	 */
-	spin_lock(&elv_list_lock);
-	e = elevator_find(chosen_elevator, false);
-	spin_unlock(&elv_list_lock);
-
-	if (!e)
-		request_module("%s-iosched", chosen_elevator);
-}
-
 static struct kobj_type elv_ktype;
 
 struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -185,7 +163,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 	hash_init(eq->hash);
-	eq->uses_mq = e->uses_mq;
 
 	return eq;
 }
@@ -200,52 +177,11 @@ static void elevator_release(struct kobject *kobj)
 	kfree(e);
 }
 
-/*
- * Use the default elevator specified by config boot param for non-mq devices,
- * or by config option.  Don't try to load modules as we could be running off
- * async and request_module() isn't allowed from async.
- */
-int elevator_init(struct request_queue *q)
-{
-	struct elevator_type *e = NULL;
-	int err = 0;
-
-	/*
-	 * q->sysfs_lock must be held to provide mutual exclusion between
-	 * elevator_switch() and here.
-	 */
-	mutex_lock(&q->sysfs_lock);
-	if (unlikely(q->elevator))
-		goto out_unlock;
-
-	if (*chosen_elevator) {
-		e = elevator_get(q, chosen_elevator, false);
-		if (!e)
-			printk(KERN_ERR "I/O scheduler %s not found\n",
-							chosen_elevator);
-	}
-
-	if (!e) {
-		printk(KERN_ERR
-			"Default I/O scheduler not found. Using noop.\n");
-		e = elevator_get(q, "noop", false);
-	}
-
-	err = e->ops.sq.elevator_init_fn(q, e);
-	if (err)
-		elevator_put(e);
-out_unlock:
-	mutex_unlock(&q->sysfs_lock);
-	return err;
-}
-
 void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->uses_mq && e->type->ops.mq.exit_sched)
+	if (e->type->ops.mq.exit_sched)
 		blk_mq_exit_sched(q, e);
-	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
-		e->type->ops.sq.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
 	kobject_put(&e->kobj);
@@ -393,10 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->uses_mq && e->type->ops.mq.request_merge)
+	if (e->type->ops.mq.request_merge)
 		return e->type->ops.mq.request_merge(q, req, bio);
-	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
-		return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
 }
@@ -447,10 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->uses_mq && e->type->ops.mq.request_merged)
+	if (e->type->ops.mq.request_merged)
 		e->type->ops.mq.request_merged(q, rq, type);
-	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
-		e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
 		elv_rqhash_reposition(q, rq);
@@ -464,13 +396,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	struct elevator_queue *e = q->elevator;
 	bool next_sorted = false;
 
-	if (e->uses_mq && e->type->ops.mq.requests_merged)
+	if (e->type->ops.mq.requests_merged)
 		e->type->ops.mq.requests_merged(q, rq, next);
-	else if (e->type->ops.sq.elevator_merge_req_fn) {
-		next_sorted = (__force bool)(next->rq_flags & RQF_SORTED);
-		if (next_sorted)
-			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
-	}
 
 	elv_rqhash_reposition(q, rq);
 
@@ -482,156 +409,12 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	q->last_merge = rq;
 }
 
-void elv_bio_merged(struct request_queue *q, struct request *rq,
-			struct bio *bio)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (WARN_ON_ONCE(e->uses_mq))
-		return;
-
-	if (e->type->ops.sq.elevator_bio_merged_fn)
-		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
-}
-
-void elv_requeue_request(struct request_queue *q, struct request *rq)
-{
-	/*
-	 * it already went through dequeue, we need to decrement the
-	 * in_flight count again
-	 */
-	if (blk_account_rq(rq)) {
-		q->in_flight[rq_is_sync(rq)]--;
-		if (rq->rq_flags & RQF_SORTED)
-			elv_deactivate_rq(q, rq);
-	}
-
-	rq->rq_flags &= ~RQF_STARTED;
-
-	blk_pm_requeue_request(rq);
-
-	__elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
-}
-
-void elv_drain_elevator(struct request_queue *q)
-{
-	struct elevator_queue *e = q->elevator;
-	static int printed;
-
-	if (WARN_ON_ONCE(e->uses_mq))
-		return;
-
-	lockdep_assert_held(q->queue_lock);
-
-	while (e->type->ops.sq.elevator_dispatch_fn(q, 1))
-		;
-	if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) {
-		printk(KERN_ERR "%s: forced dispatching is broken "
-		       "(nr_sorted=%u), please report this\n",
-		       q->elevator->type->elevator_name, q->nr_sorted);
-	}
-}
-
-void __elv_add_request(struct request_queue *q, struct request *rq, int where)
-{
-	trace_block_rq_insert(q, rq);
-
-	blk_pm_add_request(q, rq);
-
-	rq->q = q;
-
-	if (rq->rq_flags & RQF_SOFTBARRIER) {
-		/* barriers are scheduling boundary, update end_sector */
-		if (!blk_rq_is_passthrough(rq)) {
-			q->end_sector = rq_end_sector(rq);
-			q->boundary_rq = rq;
-		}
-	} else if (!(rq->rq_flags & RQF_ELVPRIV) &&
-		    (where == ELEVATOR_INSERT_SORT ||
-		     where == ELEVATOR_INSERT_SORT_MERGE))
-		where = ELEVATOR_INSERT_BACK;
-
-	switch (where) {
-	case ELEVATOR_INSERT_REQUEUE:
-	case ELEVATOR_INSERT_FRONT:
-		rq->rq_flags |= RQF_SOFTBARRIER;
-		list_add(&rq->queuelist, &q->queue_head);
-		break;
-
-	case ELEVATOR_INSERT_BACK:
-		rq->rq_flags |= RQF_SOFTBARRIER;
-		elv_drain_elevator(q);
-		list_add_tail(&rq->queuelist, &q->queue_head);
-		/*
-		 * We kick the queue here for the following reasons.
-		 * - The elevator might have returned NULL previously
-		 *   to delay requests and returned them now.  As the
-		 *   queue wasn't empty before this request, ll_rw_blk
-		 *   won't run the queue on return, resulting in hang.
-		 * - Usually, back inserted requests won't be merged
-		 *   with anything.  There's no point in delaying queue
-		 *   processing.
-		 */
-		__blk_run_queue(q);
-		break;
-
-	case ELEVATOR_INSERT_SORT_MERGE:
-		/*
-		 * If we succeed in merging this request with one in the
-		 * queue already, we are done - rq has now been freed,
-		 * so no need to do anything further.
-		 */
-		if (elv_attempt_insert_merge(q, rq))
-			break;
-		/* fall through */
-	case ELEVATOR_INSERT_SORT:
-		BUG_ON(blk_rq_is_passthrough(rq));
-		rq->rq_flags |= RQF_SORTED;
-		q->nr_sorted++;
-		if (rq_mergeable(rq)) {
-			elv_rqhash_add(q, rq);
-			if (!q->last_merge)
-				q->last_merge = rq;
-		}
-
-		/*
-		 * Some ioscheds (cfq) run q->request_fn directly, so
-		 * rq cannot be accessed after calling
-		 * elevator_add_req_fn.
-		 */
-		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
-		break;
-
-	case ELEVATOR_INSERT_FLUSH:
-		rq->rq_flags |= RQF_SOFTBARRIER;
-		blk_insert_flush(rq);
-		break;
-	default:
-		printk(KERN_ERR "%s: bad insertion point %d\n",
-		       __func__, where);
-		BUG();
-	}
-}
-EXPORT_SYMBOL(__elv_add_request);
-
-void elv_add_request(struct request_queue *q, struct request *rq, int where)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	__elv_add_request(q, rq, where);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-EXPORT_SYMBOL(elv_add_request);
-
 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->uses_mq && e->type->ops.mq.next_request)
+	if (e->type->ops.mq.next_request)
 		return e->type->ops.mq.next_request(q, rq);
-	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
-		return e->type->ops.sq.elevator_latter_req_fn(q, rq);
 
 	return NULL;
 }
@@ -640,68 +423,12 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->uses_mq && e->type->ops.mq.former_request)
+	if (e->type->ops.mq.former_request)
 		return e->type->ops.mq.former_request(q, rq);
-	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
-		return e->type->ops.sq.elevator_former_req_fn(q, rq);
+
 	return NULL;
 }
 
-int elv_set_request(struct request_queue *q, struct request *rq,
-		    struct bio *bio, gfp_t gfp_mask)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (WARN_ON_ONCE(e->uses_mq))
-		return 0;
-
-	if (e->type->ops.sq.elevator_set_req_fn)
-		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
-	return 0;
-}
-
-void elv_put_request(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (WARN_ON_ONCE(e->uses_mq))
-		return;
-
-	if (e->type->ops.sq.elevator_put_req_fn)
-		e->type->ops.sq.elevator_put_req_fn(rq);
-}
-
-int elv_may_queue(struct request_queue *q, unsigned int op)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (WARN_ON_ONCE(e->uses_mq))
-		return 0;
-
-	if (e->type->ops.sq.elevator_may_queue_fn)
-		return e->type->ops.sq.elevator_may_queue_fn(q, op);
-
-	return ELV_MQUEUE_MAY;
-}
-
-void elv_completed_request(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (WARN_ON_ONCE(e->uses_mq))
-		return;
-
-	/*
-	 * request is released from the driver, io must be done
-	 */
-	if (blk_account_rq(rq)) {
-		q->in_flight[rq_is_sync(rq)]--;
-		if ((rq->rq_flags & RQF_SORTED) &&
-		    e->type->ops.sq.elevator_completed_req_fn)
-			e->type->ops.sq.elevator_completed_req_fn(q, rq);
-	}
-}
-
 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
 
 static ssize_t
@@ -768,8 +495,6 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
-		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
-			e->type->ops.sq.elevator_registered_fn(q);
 	}
 	return error;
 }
@@ -809,7 +534,7 @@ int elv_register(struct elevator_type *e)
 
 	/* register, don't allow duplicate names */
 	spin_lock(&elv_list_lock);
-	if (elevator_find(e->elevator_name, e->uses_mq)) {
+	if (elevator_find(e->elevator_name)) {
 		spin_unlock(&elv_list_lock);
 		kmem_cache_destroy(e->icq_cache);
 		return -EBUSY;
@@ -919,71 +644,17 @@ out_unlock:
  */
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
-	struct elevator_queue *old = q->elevator;
-	bool old_registered = false;
 	int err;
 
 	lockdep_assert_held(&q->sysfs_lock);
 
-	if (q->mq_ops) {
-		blk_mq_freeze_queue(q);
-		blk_mq_quiesce_queue(q);
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
 
-		err = elevator_switch_mq(q, new_e);
+	err = elevator_switch_mq(q, new_e);
 
-		blk_mq_unquiesce_queue(q);
-		blk_mq_unfreeze_queue(q);
-
-		return err;
-	}
-
-	/*
-	 * Turn on BYPASS and drain all requests w/ elevator private data.
-	 * Block layer doesn't call into a quiesced elevator - all requests
-	 * are directly put on the dispatch list without elevator data
-	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
-	 * merge happens either.
-	 */
-	if (old) {
-		old_registered = old->registered;
-
-		blk_queue_bypass_start(q);
-
-		/* unregister and clear all auxiliary data of the old elevator */
-		if (old_registered)
-			elv_unregister_queue(q);
-
-		ioc_clear_queue(q);
-	}
-
-	/* allocate, init and register new elevator */
-	err = new_e->ops.sq.elevator_init_fn(q, new_e);
-	if (err)
-		goto fail_init;
-
-	err = elv_register_queue(q);
-	if (err)
-		goto fail_register;
-
-	/* done, kill the old one and finish */
-	if (old) {
-		elevator_exit(q, old);
-		blk_queue_bypass_end(q);
-	}
-
-	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
-
-	return 0;
-
-fail_register:
-	elevator_exit(q, q->elevator);
-fail_init:
-	/* switch failed, restore and re-register old elevator */
-	if (old) {
-		q->elevator = old;
-		elv_register_queue(q);
-		blk_queue_bypass_end(q);
-	}
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
 
 	return err;
 }
@@ -1032,7 +703,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
 	int ret;
 
-	if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q))
+	if (!q->mq_ops || !elv_support_iosched(q))
 		return count;
 
 	ret = __elevator_change(q, name);
@@ -1047,7 +718,6 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 	struct elevator_queue *e = q->elevator;
 	struct elevator_type *elv = NULL;
 	struct elevator_type *__e;
-	bool uses_mq = q->mq_ops != NULL;
 	int len = 0;
 
 	if (!queue_is_rq_based(q))
@@ -1060,14 +730,11 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
-		if (elv && elevator_match(elv, __e->elevator_name) &&
-		    (__e->uses_mq == uses_mq)) {
+		if (elv && elevator_match(elv, __e->elevator_name)) {
 			len += sprintf(name+len, "[%s] ", elv->elevator_name);
 			continue;
 		}
-		if (__e->uses_mq && q->mq_ops && elv_support_iosched(q))
-			len += sprintf(name+len, "%s ", __e->elevator_name);
-		else if (!__e->uses_mq && !q->mq_ops)
+		if (elv_support_iosched(q))
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
 	spin_unlock(&elv_list_lock);
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index eccac01a10b6..728757a34fa0 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1032,7 +1032,6 @@ static struct elevator_type kyber_sched = {
 		.dispatch_request = kyber_dispatch_request,
 		.has_work = kyber_has_work,
 	},
-	.uses_mq = true,
 #ifdef CONFIG_BLK_DEBUG_FS
 	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
 	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 099a9e05854c..513edefd10fd 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -777,7 +777,6 @@ static struct elevator_type mq_deadline = {
 		.exit_sched		= dd_exit_queue,
 	},
 
-	.uses_mq	= true,
 #ifdef CONFIG_BLK_DEBUG_FS
 	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
 #endif
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8afe3331777e..a9f6db8abcda 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -58,9 +58,6 @@ struct blk_stat_callback;
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-#define BLK_RL_SYNCFULL		(1U << 0)
-#define BLK_RL_ASYNCFULL	(1U << 1)
-
 struct request_list {
 	struct request_queue	*q;	/* the queue this rl belongs to */
 #ifdef CONFIG_BLK_CGROUP
@@ -309,11 +306,8 @@ static inline unsigned short req_get_ioprio(struct request *req)
 
 struct blk_queue_ctx;
 
-typedef void (request_fn_proc) (struct request_queue *q);
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
-typedef int (prep_rq_fn) (struct request_queue *, struct request *);
-typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
@@ -432,8 +426,6 @@ struct request_queue {
 	struct list_head	queue_head;
 	struct request		*last_merge;
 	struct elevator_queue	*elevator;
-	int			nr_rqs[2];	/* # allocated [a]sync rqs */
-	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
 	struct blk_queue_stats	*stats;
 	struct rq_qos		*rq_qos;
@@ -446,11 +438,8 @@ struct request_queue {
 	 */
 	struct request_list	root_rl;
 
-	request_fn_proc		*request_fn;
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
-	prep_rq_fn		*prep_rq_fn;
-	unprep_rq_fn		*unprep_rq_fn;
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
@@ -458,8 +447,6 @@ struct request_queue {
 	init_rq_fn		*init_rq_fn;
 	/* Called just before a request is freed */
 	exit_rq_fn		*exit_rq_fn;
-	/* Called from inside blk_get_request() */
-	void (*initialize_rq_fn)(struct request *rq);
 
 	const struct blk_mq_ops	*mq_ops;
 
@@ -475,17 +462,6 @@ struct request_queue {
 	struct blk_mq_hw_ctx	**queue_hw_ctx;
 	unsigned int		nr_hw_queues;
 
-	/*
-	 * Dispatch queue sorting
-	 */
-	sector_t		end_sector;
-	struct request		*boundary_rq;
-
-	/*
-	 * Delayed queue handling
-	 */
-	struct delayed_work	delay_work;
-
 	struct backing_dev_info	*backing_dev_info;
 
 	/*
@@ -548,9 +524,6 @@ struct request_queue {
 	 * queue settings
 	 */
 	unsigned long		nr_requests;	/* Max # of requests */
-	unsigned int		nr_congestion_on;
-	unsigned int		nr_congestion_off;
-	unsigned int		nr_batching;
 
 	unsigned int		dma_drain_size;
 	void			*dma_drain_buffer;
@@ -560,13 +533,6 @@ struct request_queue {
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
 
-	/*
-	 * Number of active block driver functions for which blk_drain_queue()
-	 * must wait. Must be incremented around functions that unlock the
-	 * queue_lock internally, e.g. scsi_request_fn().
-	 */
-	unsigned int		request_fn_active;
-
 	unsigned int		rq_timeout;
 	int			poll_nsec;
 
@@ -740,11 +706,6 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
 
-static inline int queue_in_flight(struct request_queue *q)
-{
-	return q->in_flight[0] + q->in_flight[1];
-}
-
 static inline bool blk_account_rq(struct request *rq)
 {
 	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
@@ -765,7 +726,7 @@ static inline bool blk_account_rq(struct request *rq)
  */
 static inline bool queue_is_rq_based(struct request_queue *q)
 {
-	return q->request_fn || q->mq_ops;
+	return q->mq_ops;
 }
 
 static inline unsigned int blk_queue_cluster(struct request_queue *q)
@@ -828,27 +789,6 @@ static inline bool rq_is_sync(struct request *rq)
 	return op_is_sync(rq->cmd_flags);
 }
 
-static inline bool blk_rl_full(struct request_list *rl, bool sync)
-{
-	unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-
-	return rl->flags & flag;
-}
-
-static inline void blk_set_rl_full(struct request_list *rl, bool sync)
-{
-	unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-
-	rl->flags |= flag;
-}
-
-static inline void blk_clear_rl_full(struct request_list *rl, bool sync)
-{
-	unsigned int flag = sync ? BLK_RL_SYNCFULL : BLK_RL_ASYNCFULL;
-
-	rl->flags &= ~flag;
-}
-
 static inline bool rq_mergeable(struct request *rq)
 {
 	if (blk_rq_is_passthrough(rq))
@@ -969,7 +909,6 @@ extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, unsigned int op,
 				       blk_mq_req_flags_t flags);
-extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
@@ -979,7 +918,6 @@ extern void blk_rq_unprep_clone(struct request *rq);
 extern blk_status_t blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern int blk_rq_append_bio(struct request *rq, struct bio **bio);
-extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_queue_split(struct request_queue *, struct bio **);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
@@ -992,15 +930,7 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);
 extern void blk_queue_exit(struct request_queue *q);
-extern void blk_start_queue(struct request_queue *q);
-extern void blk_start_queue_async(struct request_queue *q);
-extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
-extern void __blk_stop_queue(struct request_queue *q);
-extern void __blk_run_queue(struct request_queue *q);
-extern void __blk_run_queue_uncond(struct request_queue *q);
-extern void blk_run_queue(struct request_queue *);
-extern void blk_run_queue_async(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);
@@ -1155,13 +1085,6 @@ static inline unsigned int blk_rq_count_bios(struct request *rq)
 	return nr_bios;
 }
 
-/*
- * Request issue related functions.
- */
-extern struct request *blk_peek_request(struct request_queue *q);
-extern void blk_start_request(struct request *rq);
-extern struct request *blk_fetch_request(struct request_queue *q);
-
 void blk_steal_bios(struct bio_list *list, struct request *rq);
 
 /*
@@ -1179,9 +1102,6 @@ void blk_steal_bios(struct bio_list *list, struct request *rq);
  */
 extern bool blk_update_request(struct request *rq, blk_status_t error,
 			       unsigned int nr_bytes);
-extern void blk_finish_request(struct request *rq, blk_status_t error);
-extern bool blk_end_request(struct request *rq, blk_status_t error,
-			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, blk_status_t error);
 extern bool __blk_end_request(struct request *rq, blk_status_t error,
 			      unsigned int nr_bytes);
@@ -1190,15 +1110,10 @@ extern bool __blk_end_request_cur(struct request *rq, blk_status_t error);
 
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
-extern void blk_unprep_request(struct request *);
 
 /*
  * Access functions for manipulating queue properties
  */
-extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
-					spinlock_t *lock, int node_id);
-extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
-extern int blk_init_allocated_queue(struct request_queue *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
@@ -1239,8 +1154,6 @@ extern int blk_queue_dma_drain(struct request_queue *q,
 			       void *buf, unsigned int size);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
-extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
-extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
@@ -1298,7 +1211,6 @@ extern void blk_set_queue_dying(struct request_queue *);
  * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
-	struct list_head list; /* requests */
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
 };
@@ -1339,8 +1251,7 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 	struct blk_plug *plug = tsk->plug;
 
 	return plug &&
-		(!list_empty(&plug->list) ||
-		 !list_empty(&plug->mq_list) ||
+		 (!list_empty(&plug->mq_list) ||
 		 !list_empty(&plug->cb_list));
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 015bb59c0331..158004f1754d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -23,74 +23,6 @@ enum elv_merge {
 	ELEVATOR_DISCARD_MERGE	= 3,
 };
 
-typedef enum elv_merge (elevator_merge_fn) (struct request_queue *, struct request **,
-				 struct bio *);
-
-typedef void (elevator_merge_req_fn) (struct request_queue *, struct request *, struct request *);
-
-typedef void (elevator_merged_fn) (struct request_queue *, struct request *, enum elv_merge);
-
-typedef int (elevator_allow_bio_merge_fn) (struct request_queue *,
-					   struct request *, struct bio *);
-
-typedef int (elevator_allow_rq_merge_fn) (struct request_queue *,
-					  struct request *, struct request *);
-
-typedef void (elevator_bio_merged_fn) (struct request_queue *,
-						struct request *, struct bio *);
-
-typedef int (elevator_dispatch_fn) (struct request_queue *, int);
-
-typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
-typedef struct request *(elevator_request_list_fn) (struct request_queue *, struct request *);
-typedef void (elevator_completed_req_fn) (struct request_queue *, struct request *);
-typedef int (elevator_may_queue_fn) (struct request_queue *, unsigned int);
-
-typedef void (elevator_init_icq_fn) (struct io_cq *);
-typedef void (elevator_exit_icq_fn) (struct io_cq *);
-typedef int (elevator_set_req_fn) (struct request_queue *, struct request *,
-				   struct bio *, gfp_t);
-typedef void (elevator_put_req_fn) (struct request *);
-typedef void (elevator_activate_req_fn) (struct request_queue *, struct request *);
-typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *);
-
-typedef int (elevator_init_fn) (struct request_queue *,
-				struct elevator_type *e);
-typedef void (elevator_exit_fn) (struct elevator_queue *);
-typedef void (elevator_registered_fn) (struct request_queue *);
-
-struct elevator_ops
-{
-	elevator_merge_fn *elevator_merge_fn;
-	elevator_merged_fn *elevator_merged_fn;
-	elevator_merge_req_fn *elevator_merge_req_fn;
-	elevator_allow_bio_merge_fn *elevator_allow_bio_merge_fn;
-	elevator_allow_rq_merge_fn *elevator_allow_rq_merge_fn;
-	elevator_bio_merged_fn *elevator_bio_merged_fn;
-
-	elevator_dispatch_fn *elevator_dispatch_fn;
-	elevator_add_req_fn *elevator_add_req_fn;
-	elevator_activate_req_fn *elevator_activate_req_fn;
-	elevator_deactivate_req_fn *elevator_deactivate_req_fn;
-
-	elevator_completed_req_fn *elevator_completed_req_fn;
-
-	elevator_request_list_fn *elevator_former_req_fn;
-	elevator_request_list_fn *elevator_latter_req_fn;
-
-	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
-	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */
-
-	elevator_set_req_fn *elevator_set_req_fn;
-	elevator_put_req_fn *elevator_put_req_fn;
-
-	elevator_may_queue_fn *elevator_may_queue_fn;
-
-	elevator_init_fn *elevator_init_fn;
-	elevator_exit_fn *elevator_exit_fn;
-	elevator_registered_fn *elevator_registered_fn;
-};
-
 struct blk_mq_alloc_data;
 struct blk_mq_hw_ctx;
 
@@ -138,16 +70,15 @@ struct elevator_type
 
 	/* fields provided by elevator implementation */
 	union {
-		struct elevator_ops sq;
 		struct elevator_mq_ops mq;
 	} ops;
+
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	const char *elevator_alias;
 	struct module *elevator_owner;
-	bool uses_mq;
 #ifdef CONFIG_BLK_DEBUG_FS
 	const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
 	const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
@@ -175,40 +106,25 @@ struct elevator_queue
 	struct kobject kobj;
 	struct mutex sysfs_lock;
 	unsigned int registered:1;
-	unsigned int uses_mq:1;
 	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
 /*
  * block elevator interface
  */
-extern void elv_dispatch_sort(struct request_queue *, struct request *);
-extern void elv_dispatch_add_tail(struct request_queue *, struct request *);
-extern void elv_add_request(struct request_queue *, struct request *, int);
-extern void __elv_add_request(struct request_queue *, struct request *, int);
 extern enum elv_merge elv_merge(struct request_queue *, struct request **,
 		struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *,
 		enum elv_merge);
-extern void elv_bio_merged(struct request_queue *q, struct request *,
-				struct bio *);
 extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
-extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
-extern int elv_may_queue(struct request_queue *, unsigned int);
-extern void elv_completed_request(struct request_queue *, struct request *);
-extern int elv_set_request(struct request_queue *q, struct request *rq,
-			   struct bio *bio, gfp_t gfp_mask);
-extern void elv_put_request(struct request_queue *, struct request *);
-extern void elv_drain_elevator(struct request_queue *);
 
 /*
  * io scheduler registration
  */
-extern void __init load_default_elevator_module(void);
 extern int elv_register(struct elevator_type *);
 extern void elv_unregister(struct elevator_type *);
 
@@ -260,9 +176,5 @@ enum {
 #define rq_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
 #define rq_fifo_clear(rq)	list_del_init(&(rq)->queuelist)
 
-#else /* CONFIG_BLOCK */
-
-static inline void load_default_elevator_module(void) { }
-
 #endif /* CONFIG_BLOCK */
 #endif
diff --git a/include/linux/init.h b/include/linux/init.h
index 9c2aba1dbabf..5255069f5a9f 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -146,7 +146,6 @@ extern unsigned int reset_devices;
 /* used by init/main.c */
 void setup_arch(char **);
 void prepare_namespace(void);
-void __init load_default_modules(void);
 int __init init_rootfs(void);
 
 #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_STRICT_MODULE_RWX)
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index d1a5d885ce13..73e02ea5d5d1 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -53,9 +53,6 @@ static void __init handle_initrd(void)
 	ksys_mkdir("/old", 0700);
 	ksys_chdir("/old");
 
-	/* try loading default modules from initrd */
-	load_default_modules();
-
 	/*
 	 * In case that a resume from disk is carried out by linuxrc or one of
 	 * its children, we need to tell the freezer not to wait for us.
diff --git a/init/initramfs.c b/init/initramfs.c
index 640557788026..96af18fec4d0 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -644,12 +644,6 @@ static int __init populate_rootfs(void)
 #endif
 	}
 	flush_delayed_fput();
-	/*
-	 * Try loading default modules from initramfs.  This gives
-	 * us a chance to load before device_initcalls.
-	 */
-	load_default_modules();
-
 	return 0;
 }
 rootfs_initcall(populate_rootfs);
diff --git a/init/main.c b/init/main.c
index ee147103ba1b..ca0cdb0c388b 100644
--- a/init/main.c
+++ b/init/main.c
@@ -996,17 +996,6 @@ static void __init do_pre_smp_initcalls(void)
 		do_one_initcall(initcall_from_entry(fn));
 }
 
-/*
- * This function requests modules which should be loaded by default and is
- * called twice right after initrd is mounted and right before init is
- * exec'd.  If such modules are on either initrd or rootfs, they will be
- * loaded before control is passed to userland.
- */
-void __init load_default_modules(void)
-{
-	load_default_elevator_module();
-}
-
 static int run_init_process(const char *init_filename)
 {
 	argv_init[0] = init_filename;
@@ -1180,5 +1169,4 @@ static noinline void __init kernel_init_freeable(void)
 	 */
 
 	integrity_load_keys();
-	load_default_modules();
 }

From f9cd4bfe96955e7a1d3ec54b393dee87b815ba3b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 1 Nov 2018 16:41:41 -0600
Subject: [PATCH 028/351] block: get rid of MQ scheduler ops union

This is a remnant of when we had ops for both SQ and MQ
schedulers. Now it's just MQ, so get rid of the union.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-iosched.c      |  2 +-
 block/blk-ioc.c          |  8 ++++----
 block/blk-mq-sched.c     | 33 ++++++++++++++++-----------------
 block/blk-mq-sched.h     | 20 ++++++++++----------
 block/blk-mq.c           | 12 ++++++------
 block/elevator.c         | 26 +++++++++++++-------------
 block/kyber-iosched.c    |  2 +-
 block/mq-deadline.c      |  2 +-
 include/linux/elevator.h |  4 +---
 9 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 44c7e567aa25..c7636cbefc85 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -5724,7 +5724,7 @@ static struct elv_fs_entry bfq_attrs[] = {
 };
 
 static struct elevator_type iosched_bfq_mq = {
-	.ops.mq = {
+	.ops = {
 		.limit_depth		= bfq_limit_depth,
 		.prepare_request	= bfq_prepare_request,
 		.requeue_request        = bfq_finish_requeue_request,
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 391128456aec..007aac6e6a4b 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -48,8 +48,8 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (icq->flags & ICQ_EXITED)
 		return;
 
-	if (et->ops.mq.exit_icq)
-		et->ops.mq.exit_icq(icq);
+	if (et->ops.exit_icq)
+		et->ops.exit_icq(icq);
 
 	icq->flags |= ICQ_EXITED;
 }
@@ -396,8 +396,8 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.mq.init_icq)
-			et->ops.mq.init_icq(icq);
+		if (et->ops.init_icq)
+			et->ops.init_icq(icq);
 	} else {
 		kmem_cache_free(et->icq_cache, icq);
 		icq = ioc_lookup_icq(ioc, q);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 29bfe8017a2d..0feefd6c6aaa 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -85,14 +85,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 	do {
 		struct request *rq;
 
-		if (e->type->ops.mq.has_work &&
-				!e->type->ops.mq.has_work(hctx))
+		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;
 
 		if (!blk_mq_get_dispatch_budget(hctx))
 			break;
 
-		rq = e->type->ops.mq.dispatch_request(hctx);
+		rq = e->type->ops.dispatch_request(hctx);
 		if (!rq) {
 			blk_mq_put_dispatch_budget(hctx);
 			break;
@@ -163,7 +162,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
-	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
+	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
 	LIST_HEAD(rq_list);
 
 	/* RCU or SRCU read lock is needed before checking quiesced flag */
@@ -314,9 +313,9 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 	bool ret = false;
 
-	if (e && e->type->ops.mq.bio_merge) {
+	if (e && e->type->ops.bio_merge) {
 		blk_mq_put_ctx(ctx);
-		return e->type->ops.mq.bio_merge(hctx, bio);
+		return e->type->ops.bio_merge(hctx, bio);
 	}
 
 	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
@@ -380,11 +379,11 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
 		goto run;
 
-	if (e && e->type->ops.mq.insert_requests) {
+	if (e && e->type->ops.insert_requests) {
 		LIST_HEAD(list);
 
 		list_add(&rq->queuelist, &list);
-		e->type->ops.mq.insert_requests(hctx, &list, at_head);
+		e->type->ops.insert_requests(hctx, &list, at_head);
 	} else {
 		spin_lock(&ctx->lock);
 		__blk_mq_insert_request(hctx, rq, at_head);
@@ -403,8 +402,8 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 	struct elevator_queue *e = hctx->queue->elevator;
 
-	if (e && e->type->ops.mq.insert_requests)
-		e->type->ops.mq.insert_requests(hctx, list, false);
+	if (e && e->type->ops.insert_requests)
+		e->type->ops.insert_requests(hctx, list, false);
 	else {
 		/*
 		 * try to issue requests directly if the hw queue isn't
@@ -489,15 +488,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 			goto err;
 	}
 
-	ret = e->ops.mq.init_sched(q, e);
+	ret = e->ops.init_sched(q, e);
 	if (ret)
 		goto err;
 
 	blk_mq_debugfs_register_sched(q);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		if (e->ops.mq.init_hctx) {
-			ret = e->ops.mq.init_hctx(hctx, i);
+		if (e->ops.init_hctx) {
+			ret = e->ops.init_hctx(hctx, i);
 			if (ret) {
 				eq = q->elevator;
 				blk_mq_exit_sched(q, eq);
@@ -523,14 +522,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		blk_mq_debugfs_unregister_sched_hctx(hctx);
-		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
-			e->type->ops.mq.exit_hctx(hctx, i);
+		if (e->type->ops.exit_hctx && hctx->sched_data) {
+			e->type->ops.exit_hctx(hctx, i);
 			hctx->sched_data = NULL;
 		}
 	}
 	blk_mq_debugfs_unregister_sched(q);
-	if (e->type->ops.mq.exit_sched)
-		e->type->ops.mq.exit_sched(e);
+	if (e->type->ops.exit_sched)
+		e->type->ops.exit_sched(e);
 	blk_mq_sched_tags_teardown(q);
 	q->elevator = NULL;
 }
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 8a9544203173..947f236b273d 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -43,8 +43,8 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e && e->type->ops.mq.allow_merge)
-		return e->type->ops.mq.allow_merge(q, rq, bio);
+	if (e && e->type->ops.allow_merge)
+		return e->type->ops.allow_merge(q, rq, bio);
 
 	return true;
 }
@@ -53,8 +53,8 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
 {
 	struct elevator_queue *e = rq->q->elevator;
 
-	if (e && e->type->ops.mq.completed_request)
-		e->type->ops.mq.completed_request(rq, now);
+	if (e && e->type->ops.completed_request)
+		e->type->ops.completed_request(rq, now);
 }
 
 static inline void blk_mq_sched_started_request(struct request *rq)
@@ -62,8 +62,8 @@ static inline void blk_mq_sched_started_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e && e->type->ops.mq.started_request)
-		e->type->ops.mq.started_request(rq);
+	if (e && e->type->ops.started_request)
+		e->type->ops.started_request(rq);
 }
 
 static inline void blk_mq_sched_requeue_request(struct request *rq)
@@ -71,16 +71,16 @@ static inline void blk_mq_sched_requeue_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e && e->type->ops.mq.requeue_request)
-		e->type->ops.mq.requeue_request(rq);
+	if (e && e->type->ops.requeue_request)
+		e->type->ops.requeue_request(rq);
 }
 
 static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
 {
 	struct elevator_queue *e = hctx->queue->elevator;
 
-	if (e && e->type->ops.mq.has_work)
-		return e->type->ops.mq.has_work(hctx);
+	if (e && e->type->ops.has_work)
+		return e->type->ops.has_work(hctx);
 
 	return false;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a58d2d953876..d106d7a970cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -363,9 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		 * dispatch list. Don't include reserved tags in the
 		 * limiting, as it isn't useful.
 		 */
-		if (!op_is_flush(op) && e->type->ops.mq.limit_depth &&
+		if (!op_is_flush(op) && e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
-			e->type->ops.mq.limit_depth(op, data);
+			e->type->ops.limit_depth(op, data);
 	} else {
 		blk_mq_tag_busy(data->hctx);
 	}
@@ -383,11 +383,11 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	rq = blk_mq_rq_ctx_init(data, tag, op);
 	if (!op_is_flush(op)) {
 		rq->elv.icq = NULL;
-		if (e && e->type->ops.mq.prepare_request) {
+		if (e && e->type->ops.prepare_request) {
 			if (e->type->icq_cache && rq_ioc(bio))
 				blk_mq_sched_assign_ioc(rq, bio);
 
-			e->type->ops.mq.prepare_request(rq, bio);
+			e->type->ops.prepare_request(rq, bio);
 			rq->rq_flags |= RQF_ELVPRIV;
 		}
 	}
@@ -491,8 +491,8 @@ void blk_mq_free_request(struct request *rq)
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 
 	if (rq->rq_flags & RQF_ELVPRIV) {
-		if (e && e->type->ops.mq.finish_request)
-			e->type->ops.mq.finish_request(rq);
+		if (e && e->type->ops.finish_request)
+			e->type->ops.finish_request(rq);
 		if (rq->elv.icq) {
 			put_io_context(rq->elv.icq->ioc);
 			rq->elv.icq = NULL;
diff --git a/block/elevator.c b/block/elevator.c
index 334097c54b08..19351ffa56b1 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -61,8 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.mq.allow_merge)
-		return e->type->ops.mq.allow_merge(q, rq, bio);
+	if (e->type->ops.allow_merge)
+		return e->type->ops.allow_merge(q, rq, bio);
 
 	return 1;
 }
@@ -180,7 +180,7 @@ static void elevator_release(struct kobject *kobj)
 void elevator_exit(struct request_queue *q, struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->type->ops.mq.exit_sched)
+	if (e->type->ops.exit_sched)
 		blk_mq_exit_sched(q, e);
 	mutex_unlock(&e->sysfs_lock);
 
@@ -329,8 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->type->ops.mq.request_merge)
-		return e->type->ops.mq.request_merge(q, req, bio);
+	if (e->type->ops.request_merge)
+		return e->type->ops.request_merge(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
 }
@@ -381,8 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.mq.request_merged)
-		e->type->ops.mq.request_merged(q, rq, type);
+	if (e->type->ops.request_merged)
+		e->type->ops.request_merged(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
 		elv_rqhash_reposition(q, rq);
@@ -396,8 +396,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	struct elevator_queue *e = q->elevator;
 	bool next_sorted = false;
 
-	if (e->type->ops.mq.requests_merged)
-		e->type->ops.mq.requests_merged(q, rq, next);
+	if (e->type->ops.requests_merged)
+		e->type->ops.requests_merged(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
 
@@ -413,8 +413,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.mq.next_request)
-		return e->type->ops.mq.next_request(q, rq);
+	if (e->type->ops.next_request)
+		return e->type->ops.next_request(q, rq);
 
 	return NULL;
 }
@@ -423,8 +423,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.mq.former_request)
-		return e->type->ops.mq.former_request(q, rq);
+	if (e->type->ops.former_request)
+		return e->type->ops.former_request(q, rq);
 
 	return NULL;
 }
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 728757a34fa0..1fd83a91e749 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -1017,7 +1017,7 @@ static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
 #endif
 
 static struct elevator_type kyber_sched = {
-	.ops.mq = {
+	.ops = {
 		.init_sched = kyber_init_sched,
 		.exit_sched = kyber_exit_sched,
 		.init_hctx = kyber_init_hctx,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 513edefd10fd..1bd06cefce57 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -761,7 +761,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
 #endif
 
 static struct elevator_type mq_deadline = {
-	.ops.mq = {
+	.ops = {
 		.insert_requests	= dd_insert_requests,
 		.dispatch_request	= dd_dispatch_request,
 		.prepare_request	= dd_prepare_request,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 158004f1754d..2e9e2763bf47 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -69,9 +69,7 @@ struct elevator_type
 	struct kmem_cache *icq_cache;
 
 	/* fields provided by elevator implementation */
-	union {
-		struct elevator_mq_ops mq;
-	} ops;
+	struct elevator_mq_ops ops;
 
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */

From 92bc5a24844ada9b010f03c49a493e3edeadaa54 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 13:52:28 -0600
Subject: [PATCH 029/351] block: remove __blk_put_request()

Now there's no difference between blk_put_request() and
__blk_put_request() anymore, get rid of the underscore version and
convert the few callers.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c                   | 9 ---------
 block/blk-merge.c                  | 2 +-
 drivers/scsi/osd/osd_initiator.c   | 4 ++--
 drivers/scsi/osst.c                | 2 +-
 drivers/scsi/scsi_error.c          | 2 +-
 drivers/scsi/sg.c                  | 2 +-
 drivers/scsi/st.c                  | 2 +-
 drivers/target/target_core_pscsi.c | 2 +-
 include/linux/blkdev.h             | 1 -
 9 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 18538a41a532..700dd4587282 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -803,15 +803,6 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
 }
 EXPORT_SYMBOL_GPL(part_round_stats);
 
-void __blk_put_request(struct request_queue *q, struct request *req)
-{
-	if (unlikely(!q))
-		return;
-
-	blk_mq_free_request(req);
-}
-EXPORT_SYMBOL_GPL(__blk_put_request);
-
 void blk_put_request(struct request *req)
 {
 	blk_mq_free_request(req);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c068c30b0c35..3d073305da33 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -866,7 +866,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 
 	free = attempt_merge(q, rq, next);
 	if (free) {
-		__blk_put_request(q, free);
+		blk_put_request(free);
 		return 1;
 	}
 
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index e19fa883376f..60cf7c5eb880 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -506,11 +506,11 @@ static void osd_request_async_done(struct request *req, blk_status_t error)
 
 	_set_error_resid(or, req, error);
 	if (req->next_rq) {
-		__blk_put_request(req->q, req->next_rq);
+		blk_put_request(req->next_rq);
 		req->next_rq = NULL;
 	}
 
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 	or->request = NULL;
 	or->in.req = NULL;
 	or->out.req = NULL;
diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c
index 7a1a1edde35d..664c1238a87f 100644
--- a/drivers/scsi/osst.c
+++ b/drivers/scsi/osst.c
@@ -341,7 +341,7 @@ static void osst_end_async(struct request *req, blk_status_t status)
 		blk_rq_unmap_user(SRpnt->bio);
 	}
 
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 }
 
 /* osst_request memory management */
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index fff128aa9ec2..dd338a8cd275 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -1932,7 +1932,7 @@ maybe_retry:
 
 static void eh_lock_door_done(struct request *req, blk_status_t status)
 {
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 }
 
 /**
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index c6ad00703c5b..4e27460ec926 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1390,7 +1390,7 @@ sg_rq_end_io(struct request *rq, blk_status_t status)
 	 */
 	srp->rq = NULL;
 	scsi_req_free_cmd(scsi_req(rq));
-	__blk_put_request(rq->q, rq);
+	blk_put_request(rq);
 
 	write_lock_irqsave(&sfp->rq_list_lock, iflags);
 	if (unlikely(srp->orphan)) {
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 307df2fa39a3..7ff22d3f03e3 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -530,7 +530,7 @@ static void st_scsi_execute_end(struct request *req, blk_status_t status)
 		complete(SRpnt->waiting);
 
 	blk_rq_unmap_user(tmp);
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 }
 
 static int st_scsi_execute(struct st_request *SRpnt, const unsigned char *cmd,
diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c
index 47d76c862014..c062d363dce3 100644
--- a/drivers/target/target_core_pscsi.c
+++ b/drivers/target/target_core_pscsi.c
@@ -1094,7 +1094,7 @@ static void pscsi_req_done(struct request *req, blk_status_t status)
 		break;
 	}
 
-	__blk_put_request(req->q, req);
+	blk_put_request(req);
 	kfree(pt);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index a9f6db8abcda..c502a7f40e84 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -906,7 +906,6 @@ extern blk_qc_t direct_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_init_request_from_bio(struct request *req, struct bio *bio);
 extern void blk_put_request(struct request *);
-extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, unsigned int op,
 				       blk_mq_req_flags_t flags);
 extern int blk_lld_busy(struct request_queue *q);

From 4316b79e4321d4140164e42f228778e5bc66c84f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 10:25:07 -0600
Subject: [PATCH 030/351] block: kill legacy parts of timeout handling

The only user of legacy timing now is BSG, which is invoked
from the mq timeout handler. Kill the legacy code, and rename
the q->rq_timed_out_fn to q->bsg_job_timeout_fn.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  1 -
 block/blk-settings.c   |  7 ---
 block/blk-timeout.c    | 99 +++---------------------------------------
 block/blk.h            |  1 -
 block/bsg-lib.c        |  6 +--
 include/linux/blkdev.h |  4 +-
 6 files changed, 11 insertions(+), 107 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 700dd4587282..ccfe2a65cc22 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -656,7 +656,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 		    laptop_mode_timer_fn, 0);
 	timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
 	INIT_WORK(&q->timeout_work, NULL);
-	INIT_LIST_HEAD(&q->timeout_list);
 	INIT_LIST_HEAD(&q->icq_list);
 #ifdef CONFIG_BLK_CGROUP
 	INIT_LIST_HEAD(&q->blkg_list);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 39c3c301a687..e3f07d94b18d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -32,13 +32,6 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
 }
 EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
 
-void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
-{
-	WARN_ON_ONCE(q->mq_ops);
-	q->rq_timed_out_fn = fn;
-}
-EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
-
 /**
  * blk_set_default_limits - reset limits to default values
  * @lim:  the queue_limits structure to reset
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index f2cfd56e1606..6428d458072a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -78,70 +78,6 @@ void blk_delete_timer(struct request *req)
 	list_del_init(&req->timeout_list);
 }
 
-static void blk_rq_timed_out(struct request *req)
-{
-	struct request_queue *q = req->q;
-	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
-
-	if (q->rq_timed_out_fn)
-		ret = q->rq_timed_out_fn(req);
-	switch (ret) {
-	case BLK_EH_RESET_TIMER:
-		blk_add_timer(req);
-		blk_clear_rq_complete(req);
-		break;
-	case BLK_EH_DONE:
-		/*
-		 * LLD handles this for now but in the future
-		 * we can send a request msg to abort the command
-		 * and we can move more of the generic scsi eh code to
-		 * the blk layer.
-		 */
-		break;
-	default:
-		printk(KERN_ERR "block: bad eh return: %d\n", ret);
-		break;
-	}
-}
-
-static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout,
-			  unsigned int *next_set)
-{
-	const unsigned long deadline = blk_rq_deadline(rq);
-
-	if (time_after_eq(jiffies, deadline)) {
-		list_del_init(&rq->timeout_list);
-
-		/*
-		 * Check if we raced with end io completion
-		 */
-		if (!blk_mark_rq_complete(rq))
-			blk_rq_timed_out(rq);
-	} else if (!*next_set || time_after(*next_timeout, deadline)) {
-		*next_timeout = deadline;
-		*next_set = 1;
-	}
-}
-
-void blk_timeout_work(struct work_struct *work)
-{
-	struct request_queue *q =
-		container_of(work, struct request_queue, timeout_work);
-	unsigned long flags, next = 0;
-	struct request *rq, *tmp;
-	int next_set = 0;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-
-	list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list)
-		blk_rq_check_expired(rq, &next, &next_set);
-
-	if (next_set)
-		mod_timer(&q->timeout, round_jiffies_up(next));
-
-	spin_unlock_irqrestore(q->queue_lock, flags);
-}
-
 /**
  * blk_abort_request -- Request request recovery for the specified command
  * @req:	pointer to the request of interest
@@ -153,20 +89,13 @@ void blk_timeout_work(struct work_struct *work)
  */
 void blk_abort_request(struct request *req)
 {
-	if (req->q->mq_ops) {
-		/*
-		 * All we need to ensure is that timeout scan takes place
-		 * immediately and that scan sees the new timeout value.
-		 * No need for fancy synchronizations.
-		 */
-		blk_rq_set_deadline(req, jiffies);
-		kblockd_schedule_work(&req->q->timeout_work);
-	} else {
-		if (blk_mark_rq_complete(req))
-			return;
-		blk_delete_timer(req);
-		blk_rq_timed_out(req);
-	}
+	/*
+	 * All we need to ensure is that timeout scan takes place
+	 * immediately and that scan sees the new timeout value.
+	 * No need for fancy synchronizations.
+	 */
+	blk_rq_set_deadline(req, jiffies);
+	kblockd_schedule_work(&req->q->timeout_work);
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
 
@@ -194,13 +123,6 @@ void blk_add_timer(struct request *req)
 	struct request_queue *q = req->q;
 	unsigned long expiry;
 
-	if (!q->mq_ops)
-		lockdep_assert_held(q->queue_lock);
-
-	/* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */
-	if (!q->mq_ops && !q->rq_timed_out_fn)
-		return;
-
 	BUG_ON(!list_empty(&req->timeout_list));
 
 	/*
@@ -213,13 +135,6 @@ void blk_add_timer(struct request *req)
 	req->rq_flags &= ~RQF_TIMED_OUT;
 	blk_rq_set_deadline(req, jiffies + req->timeout);
 
-	/*
-	 * Only the non-mq case needs to add the request to a protected list.
-	 * For the mq case we simply scan the tag map.
-	 */
-	if (!q->mq_ops)
-		list_add_tail(&req->timeout_list, &req->q->timeout_list);
-
 	/*
 	 * If the timer isn't already pending or this timeout is earlier
 	 * than an existing one, modify the timer. Round up to next nearest
diff --git a/block/blk.h b/block/blk.h
index e2604ae7ddfa..4ae6cacb4548 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -224,7 +224,6 @@ static inline bool bio_integrity_endio(struct bio *bio)
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
-void blk_timeout_work(struct work_struct *work);
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
 void blk_delete_timer(struct request *);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index faf20f4500c9..f38c7bc272c0 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -307,8 +307,8 @@ static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
 	enum blk_eh_timer_return ret = BLK_EH_DONE;
 	struct request_queue *q = rq->q;
 
-	if (q->rq_timed_out_fn)
-		ret = q->rq_timed_out_fn(rq);
+	if (q->bsg_job_timeout_fn)
+		ret = q->bsg_job_timeout_fn(rq);
 
 	return ret;
 }
@@ -357,9 +357,9 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 
 	q->queuedata = dev;
 	q->bsg_job_fn = job_fn;
+	q->bsg_job_timeout_fn = timeout;
 	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
-	q->rq_timed_out_fn = timeout;
 
 	ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
 	if (ret) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c502a7f40e84..0364fc53f5c8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -441,7 +441,6 @@ struct request_queue {
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
 	softirq_done_fn		*softirq_done_fn;
-	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 	/* Called just after a request is allocated */
 	init_rq_fn		*init_rq_fn;
@@ -541,7 +540,6 @@ struct request_queue {
 
 	struct timer_list	timeout;
 	struct work_struct	timeout_work;
-	struct list_head	timeout_list;
 
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
@@ -601,6 +599,7 @@ struct request_queue {
 
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
+	rq_timed_out_fn		*bsg_job_timeout_fn;
 	struct bsg_class_device bsg_dev;
 #endif
 
@@ -1156,7 +1155,6 @@ extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
-extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);

From 1028e4b335665290dc563d5272f3c6b84e7fd66e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 09:47:17 -0600
Subject: [PATCH 031/351] bsg: move bsg-lib parts outside of request queue

Get rid of the special bsg job fn and timeout handler, move them
into a private bsg_set instead.

Mostly from Christoph, with fixes for error handling and cleanups.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bsg-lib.c         | 43 ++++++++++++++++++++++++++---------------
 include/linux/blkdev.h  |  5 -----
 include/linux/bsg-lib.h |  5 ++++-
 3 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f38c7bc272c0..192129856342 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -31,6 +31,12 @@
 
 #define uptr64(val) ((void __user *)(uintptr_t)(val))
 
+struct bsg_set {
+	struct blk_mq_tag_set	tag_set;
+	bsg_job_fn		*job_fn;
+	bsg_timeout_fn		*timeout_fn;
+};
+
 static int bsg_transport_check_proto(struct sg_io_v4 *hdr)
 {
 	if (hdr->protocol != BSG_PROTOCOL_SCSI  ||
@@ -239,6 +245,8 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request_queue *q = hctx->queue;
 	struct device *dev = q->queuedata;
 	struct request *req = bd->rq;
+	struct bsg_set *bset =
+		container_of(q->tag_set, struct bsg_set, tag_set);
 	int ret;
 
 	blk_mq_start_request(req);
@@ -249,7 +257,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!bsg_prepare_job(dev, req))
 		return BLK_STS_IOERR;
 
-	ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req));
+	ret = bset->job_fn(blk_mq_rq_to_pdu(req));
 	if (ret)
 		return BLK_STS_IOERR;
 
@@ -292,25 +300,25 @@ static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
 void bsg_remove_queue(struct request_queue *q)
 {
 	if (q) {
-		struct blk_mq_tag_set *set = q->tag_set;
+		struct bsg_set *bset =
+			container_of(q->tag_set, struct bsg_set, tag_set);
 
 		bsg_unregister_queue(q);
 		blk_cleanup_queue(q);
-		blk_mq_free_tag_set(set);
-		kfree(set);
+		blk_mq_free_tag_set(&bset->tag_set);
+		kfree(bset);
 	}
 }
 EXPORT_SYMBOL_GPL(bsg_remove_queue);
 
 static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
 {
-	enum blk_eh_timer_return ret = BLK_EH_DONE;
-	struct request_queue *q = rq->q;
+	struct bsg_set *bset =
+		container_of(rq->q->tag_set, struct bsg_set, tag_set);
 
-	if (q->bsg_job_timeout_fn)
-		ret = q->bsg_job_timeout_fn(rq);
-
-	return ret;
+	if (!bset->timeout_fn)
+		return BLK_EH_DONE;
+	return bset->timeout_fn(rq);
 }
 
 static const struct blk_mq_ops bsg_mq_ops = {
@@ -330,16 +338,21 @@ static const struct blk_mq_ops bsg_mq_ops = {
  * @dd_job_size: size of LLD data needed for each job
  */
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size)
+		bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size)
 {
+	struct bsg_set *bset;
 	struct blk_mq_tag_set *set;
 	struct request_queue *q;
 	int ret = -ENOMEM;
 
-	set = kzalloc(sizeof(*set), GFP_KERNEL);
-	if (!set)
+	bset = kzalloc(sizeof(*bset), GFP_KERNEL);
+	if (!bset)
 		return ERR_PTR(-ENOMEM);
 
+	bset->job_fn = job_fn;
+	bset->timeout_fn = timeout;
+
+	set = &bset->tag_set;
 	set->ops = &bsg_mq_ops,
 	set->nr_hw_queues = 1;
 	set->queue_depth = 128;
@@ -356,8 +369,6 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
 	}
 
 	q->queuedata = dev;
-	q->bsg_job_fn = job_fn;
-	q->bsg_job_timeout_fn = timeout;
 	blk_queue_flag_set(QUEUE_FLAG_BIDI, q);
 	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
 
@@ -374,7 +385,7 @@ out_cleanup_queue:
 out_queue:
 	blk_mq_free_tag_set(set);
 out_tag_set:
-	kfree(set);
+	kfree(bset);
 	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(bsg_setup_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0364fc53f5c8..877a3d235c45 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -312,7 +312,6 @@ typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
-typedef int (bsg_job_fn) (struct bsg_job *);
 typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
 typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
@@ -321,8 +320,6 @@ enum blk_eh_timer_return {
 	BLK_EH_RESET_TIMER,	/* reset timer and try again */
 };
 
-typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
-
 enum blk_queue_state {
 	Queue_down,
 	Queue_up,
@@ -598,8 +595,6 @@ struct request_queue {
 	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
-	bsg_job_fn		*bsg_job_fn;
-	rq_timed_out_fn		*bsg_job_timeout_fn;
 	struct bsg_class_device bsg_dev;
 #endif
 
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
index 9c9b134b1fa5..b356e0006731 100644
--- a/include/linux/bsg-lib.h
+++ b/include/linux/bsg-lib.h
@@ -31,6 +31,9 @@ struct device;
 struct scatterlist;
 struct request_queue;
 
+typedef int (bsg_job_fn) (struct bsg_job *);
+typedef enum blk_eh_timer_return (bsg_timeout_fn)(struct request *);
+
 struct bsg_buffer {
 	unsigned int payload_len;
 	int sg_cnt;
@@ -72,7 +75,7 @@ struct bsg_job {
 void bsg_job_done(struct bsg_job *job, int result,
 		  unsigned int reply_payload_rcv_len);
 struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
-		bsg_job_fn *job_fn, rq_timed_out_fn *timeout, int dd_job_size);
+		bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size);
 void bsg_remove_queue(struct request_queue *q);
 void bsg_job_put(struct bsg_job *job);
 int __must_check bsg_job_get(struct bsg_job *job);

From db6d995235606191fa9db0c717e9d843200b71ea Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 2 Nov 2018 08:46:15 -0600
Subject: [PATCH 032/351] block: remove request_list code

It's now dead code, nobody uses it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 47 ------------------
 block/blk-core.c           | 75 -----------------------------
 block/blk-mq.c             |  4 --
 block/blk.h                |  3 --
 include/linux/blk-cgroup.h | 97 --------------------------------------
 include/linux/blkdev.h     | 34 -------------
 6 files changed, 260 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 41b2470042d1..6c65791bc3fe 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -76,9 +76,6 @@ static void blkg_free(struct blkcg_gq *blkg)
 		if (blkg->pd[i])
 			blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
-	if (blkg->blkcg != &blkcg_root)
-		blk_exit_rl(blkg->q, &blkg->rl);
-
 	blkg_rwstat_exit(&blkg->stat_ios);
 	blkg_rwstat_exit(&blkg->stat_bytes);
 	kfree(blkg);
@@ -112,13 +109,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	blkg->blkcg = blkcg;
 	atomic_set(&blkg->refcnt, 1);
 
-	/* root blkg uses @q->root_rl, init rl only for !root blkgs */
-	if (blkcg != &blkcg_root) {
-		if (blk_init_rl(&blkg->rl, q, gfp_mask))
-			goto err_free;
-		blkg->rl.blkg = blkg;
-	}
-
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
 		struct blkg_policy_data *pd;
@@ -377,7 +367,6 @@ static void blkg_destroy_all(struct request_queue *q)
 	}
 
 	q->root_blkg = NULL;
-	q->root_rl.blkg = NULL;
 }
 
 /*
@@ -403,41 +392,6 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
 }
 EXPORT_SYMBOL_GPL(__blkg_release_rcu);
 
-/*
- * The next function used by blk_queue_for_each_rl().  It's a bit tricky
- * because the root blkg uses @q->root_rl instead of its own rl.
- */
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
-					 struct request_queue *q)
-{
-	struct list_head *ent;
-	struct blkcg_gq *blkg;
-
-	/*
-	 * Determine the current blkg list_head.  The first entry is
-	 * root_rl which is off @q->blkg_list and mapped to the head.
-	 */
-	if (rl == &q->root_rl) {
-		ent = &q->blkg_list;
-		/* There are no more block groups, hence no request lists */
-		if (list_empty(ent))
-			return NULL;
-	} else {
-		blkg = container_of(rl, struct blkcg_gq, rl);
-		ent = &blkg->q_node;
-	}
-
-	/* walk to the next list_head, skip root blkcg */
-	ent = ent->next;
-	if (ent == &q->root_blkg->q_node)
-		ent = ent->next;
-	if (ent == &q->blkg_list)
-		return NULL;
-
-	blkg = container_of(ent, struct blkcg_gq, q_node);
-	return &blkg->rl;
-}
-
 static int blkcg_reset_stats(struct cgroup_subsys_state *css,
 			     struct cftype *cftype, u64 val)
 {
@@ -1230,7 +1184,6 @@ int blkcg_init_queue(struct request_queue *q)
 	if (IS_ERR(blkg))
 		goto err_unlock;
 	q->root_blkg = blkg;
-	q->root_rl.blkg = blkg;
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 
diff --git a/block/blk-core.c b/block/blk-core.c
index ccfe2a65cc22..45f5c5898fd7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -450,81 +450,6 @@ void blk_cleanup_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_cleanup_queue);
 
-/* Allocate memory local to the request queue */
-static void *alloc_request_simple(gfp_t gfp_mask, void *data)
-{
-	struct request_queue *q = data;
-
-	return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node);
-}
-
-static void free_request_simple(void *element, void *data)
-{
-	kmem_cache_free(request_cachep, element);
-}
-
-static void *alloc_request_size(gfp_t gfp_mask, void *data)
-{
-	struct request_queue *q = data;
-	struct request *rq;
-
-	rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask,
-			q->node);
-	if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) {
-		kfree(rq);
-		rq = NULL;
-	}
-	return rq;
-}
-
-static void free_request_size(void *element, void *data)
-{
-	struct request_queue *q = data;
-
-	if (q->exit_rq_fn)
-		q->exit_rq_fn(q, element);
-	kfree(element);
-}
-
-int blk_init_rl(struct request_list *rl, struct request_queue *q,
-		gfp_t gfp_mask)
-{
-	if (unlikely(rl->rq_pool) || q->mq_ops)
-		return 0;
-
-	rl->q = q;
-	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
-	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
-	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
-	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
-
-	if (q->cmd_size) {
-		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
-				alloc_request_size, free_request_size,
-				q, gfp_mask, q->node);
-	} else {
-		rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ,
-				alloc_request_simple, free_request_simple,
-				q, gfp_mask, q->node);
-	}
-	if (!rl->rq_pool)
-		return -ENOMEM;
-
-	if (rl != &q->root_rl)
-		WARN_ON_ONCE(!blk_get_queue(q));
-
-	return 0;
-}
-
-void blk_exit_rl(struct request_queue *q, struct request_list *rl)
-{
-	if (rl->rq_pool) {
-		mempool_destroy(rl->rq_pool);
-		if (rl != &q->root_rl)
-			blk_put_queue(q);
-	}
-}
-
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 {
 	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d106d7a970cc..2600cba56408 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -326,10 +326,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->end_io_data = NULL;
 	rq->next_rq = NULL;
 
-#ifdef CONFIG_BLK_CGROUP
-	rq->rl = NULL;
-#endif
-
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
 	refcount_set(&rq->ref, 1);
 	return rq;
diff --git a/block/blk.h b/block/blk.h
index 4ae6cacb4548..e925cf4fe4de 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -120,9 +120,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
 		int node, int cmd_size, gfp_t flags);
 void blk_free_flush_queue(struct blk_flush_queue *q);
 
-int blk_init_rl(struct request_list *rl, struct request_queue *q,
-		gfp_t gfp_mask);
-void blk_exit_rl(struct request_queue *q, struct request_list *rl);
 void blk_exit_queue(struct request_queue *q);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 6d766a19f2bb..1b299e025e83 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -122,9 +122,6 @@ struct blkcg_gq {
 	/* all non-root blkcg_gq's are guaranteed to have access to parent */
 	struct blkcg_gq			*parent;
 
-	/* request allocation list for this blkcg-q pair */
-	struct request_list		rl;
-
 	/* reference count */
 	atomic_t			refcnt;
 
@@ -515,94 +512,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))
 
-/**
- * blk_get_rl - get request_list to use
- * @q: request_queue of interest
- * @bio: bio which will be attached to the allocated request (may be %NULL)
- *
- * The caller wants to allocate a request from @q to use for @bio.  Find
- * the request_list to use and obtain a reference on it.  Should be called
- * under queue_lock.  This function is guaranteed to return non-%NULL
- * request_list.
- */
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-					      struct bio *bio)
-{
-	struct blkcg *blkcg;
-	struct blkcg_gq *blkg;
-
-	rcu_read_lock();
-
-	blkcg = bio_blkcg(bio);
-
-	/* bypass blkg lookup and use @q->root_rl directly for root */
-	if (blkcg == &blkcg_root)
-		goto root_rl;
-
-	/*
-	 * Try to use blkg->rl.  blkg lookup may fail under memory pressure
-	 * or if either the blkcg or queue is going away.  Fall back to
-	 * root_rl in such cases.
-	 */
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg))
-		goto root_rl;
-
-	blkg_get(blkg);
-	rcu_read_unlock();
-	return &blkg->rl;
-root_rl:
-	rcu_read_unlock();
-	return &q->root_rl;
-}
-
-/**
- * blk_put_rl - put request_list
- * @rl: request_list to put
- *
- * Put the reference acquired by blk_get_rl().  Should be called under
- * queue_lock.
- */
-static inline void blk_put_rl(struct request_list *rl)
-{
-	if (rl->blkg->blkcg != &blkcg_root)
-		blkg_put(rl->blkg);
-}
-
-/**
- * blk_rq_set_rl - associate a request with a request_list
- * @rq: request of interest
- * @rl: target request_list
- *
- * Associate @rq with @rl so that accounting and freeing can know the
- * request_list @rq came from.
- */
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
-{
-	rq->rl = rl;
-}
-
-/**
- * blk_rq_rl - return the request_list a request came from
- * @rq: request of interest
- *
- * Return the request_list @rq is allocated from.
- */
-static inline struct request_list *blk_rq_rl(struct request *rq)
-{
-	return rq->rl;
-}
-
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
-					 struct request_queue *q);
-/**
- * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
- *
- * Should be used under queue_lock.
- */
-#define blk_queue_for_each_rl(rl, q)	\
-	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-
 static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
 	int ret;
@@ -939,12 +848,6 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-					      struct bio *bio) { return &q->root_rl; }
-static inline void blk_put_rl(struct request_list *rl) { }
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
-static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
-
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 877a3d235c45..e0c661a95c39 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -58,22 +58,6 @@ struct blk_stat_callback;
 
 typedef void (rq_end_io_fn)(struct request *, blk_status_t);
 
-struct request_list {
-	struct request_queue	*q;	/* the queue this rl belongs to */
-#ifdef CONFIG_BLK_CGROUP
-	struct blkcg_gq		*blkg;	/* blkg this request pool belongs to */
-#endif
-	/*
-	 * count[], starved[], and wait[] are indexed by
-	 * BLK_RW_SYNC/BLK_RW_ASYNC
-	 */
-	int			count[2];
-	int			starved[2];
-	mempool_t		*rq_pool;
-	wait_queue_head_t	wait[2];
-	unsigned int		flags;
-};
-
 /*
  * request flags */
 typedef __u32 __bitwise req_flags_t;
@@ -259,10 +243,6 @@ struct request {
 
 	/* for bidi */
 	struct request *next_rq;
-
-#ifdef CONFIG_BLK_CGROUP
-	struct request_list *rl;		/* rl this rq is alloced from */
-#endif
 };
 
 static inline bool blk_op_is_scsi(unsigned int op)
@@ -312,8 +292,6 @@ typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
 struct bio_vec;
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
-typedef int (init_rq_fn)(struct request_queue *, struct request *, gfp_t);
-typedef void (exit_rq_fn)(struct request_queue *, struct request *);
 
 enum blk_eh_timer_return {
 	BLK_EH_DONE,		/* drivers has completed the command */
@@ -427,22 +405,10 @@ struct request_queue {
 	struct blk_queue_stats	*stats;
 	struct rq_qos		*rq_qos;
 
-	/*
-	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
-	 * is used, root blkg allocates from @q->root_rl and all other
-	 * blkgs from their own blkg->rl.  Which one to use should be
-	 * determined using bio_request_list().
-	 */
-	struct request_list	root_rl;
-
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
 	softirq_done_fn		*softirq_done_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
-	/* Called just after a request is allocated */
-	init_rq_fn		*init_rq_fn;
-	/* Called just before a request is freed */
-	exit_rq_fn		*exit_rq_fn;
 
 	const struct blk_mq_ops	*mq_ops;
 

From 820efc62fc6ccf07be40a1040d4b793286de8439 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 11 Oct 2018 16:00:52 -0600
Subject: [PATCH 033/351] block: kill request slab cache

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 8 --------
 block/blk.h      | 1 -
 2 files changed, 9 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 45f5c5898fd7..a14dab57ff8b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -57,11 +57,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
 
 DEFINE_IDA(blk_queue_ida);
 
-/*
- * For the allocated request tables
- */
-struct kmem_cache *request_cachep;
-
 /*
  * For queue allocation
  */
@@ -1971,9 +1966,6 @@ int __init blk_dev_init(void)
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
 
-	request_cachep = kmem_cache_create("blkdev_requests",
-			sizeof(struct request), 0, SLAB_PANIC, NULL);
-
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
diff --git a/block/blk.h b/block/blk.h
index e925cf4fe4de..2bf1cfeeb9c0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -32,7 +32,6 @@ struct blk_flush_queue {
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
-extern struct kmem_cache *request_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 

From 2081a56bfaadfbcec479d25c8f3120e2224a745d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 12 Oct 2018 12:39:10 -0600
Subject: [PATCH 034/351] block: remove req_no_special_merge() from merging
 code

It'll always be false at this point, just remove it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c | 25 +++----------------------
 1 file changed, 3 insertions(+), 22 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3d073305da33..28c92b3098c0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -595,17 +595,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 	return ll_new_hw_segment(q, req, bio);
 }
 
-/*
- * blk-mq uses req->special to carry normal driver per-request payload, it
- * does not indicate a prepared command that we cannot merge with.
- */
-static bool req_no_special_merge(struct request *req)
-{
-	struct request_queue *q = req->q;
-
-	return !q->mq_ops && req->special;
-}
-
 static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
 		struct request *next)
 {
@@ -631,13 +620,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	unsigned int seg_size =
 		req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
 
-	/*
-	 * First check if the either of the requests are re-queued
-	 * requests.  Can't merge them if they are.
-	 */
-	if (req_no_special_merge(req) || req_no_special_merge(next))
-		return 0;
-
 	if (req_gap_back_merge(req, next->bio))
 		return 0;
 
@@ -757,8 +739,7 @@ static struct request *attempt_merge(struct request_queue *q,
 		return NULL;
 
 	if (rq_data_dir(req) != rq_data_dir(next)
-	    || req->rq_disk != next->rq_disk
-	    || req_no_special_merge(next))
+	    || req->rq_disk != next->rq_disk)
 		return NULL;
 
 	if (req_op(req) == REQ_OP_WRITE_SAME &&
@@ -885,8 +866,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (bio_data_dir(bio) != rq_data_dir(rq))
 		return false;
 
-	/* must be same device and not a special request */
-	if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq))
+	/* must be same device */
+	if (rq->rq_disk != bio->bi_disk)
 		return false;
 
 	/* only merge integrity protected bio into ditto rq */

From 8b98a97f80ed0345d254fb645e7c78c40ebed8a6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 20 Oct 2018 15:43:37 -0600
Subject: [PATCH 035/351] blk-merge: kill dead queue lock held check

This is dead code, any queue reaching this part has mq_ops
attached.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 28c92b3098c0..a399b2fa8bc8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -729,9 +729,6 @@ enum elv_merge blk_try_req_merge(struct request *req, struct request *next)
 static struct request *attempt_merge(struct request_queue *q,
 				     struct request *req, struct request *next)
 {
-	if (!q->mq_ops)
-		lockdep_assert_held(q->queue_lock);
-
 	if (!rq_mergeable(req) || !rq_mergeable(next))
 		return NULL;
 

From 7d692330e7cd581ccfee982334bf06b236cb999a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 10:48:12 -0600
Subject: [PATCH 036/351] block: get rid of blk_queued_rq()

No point in hiding what this does, just open code it in the
one spot where we are still using it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 2 +-
 include/linux/blkdev.h | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2600cba56408..b49f5bd86f42 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -692,7 +692,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 	/* this request will be re-inserted to io scheduler queue */
 	blk_mq_sched_requeue_request(rq);
 
-	BUG_ON(blk_queued_rq(rq));
+	BUG_ON(!list_empty(&rq->queuelist));
 	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
 }
 EXPORT_SYMBOL(blk_mq_requeue_request);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e0c661a95c39..c675e2b5af62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -673,8 +673,6 @@ static inline bool blk_account_rq(struct request *rq)
 
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
-/* rq->queuelist of dequeued request must be list_empty() */
-#define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 

From c7bb9ad1744ea14e61e5fff99ee5282709b0c9d9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Oct 2018 09:43:30 -0600
Subject: [PATCH 037/351] block: get rid of q->softirq_done_fn()

With the legacy path gone, all we do is funnel it through the
mq_ops->complete() operation.

Tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 17 ++++++++---------
 block/blk-settings.c   |  6 ------
 block/blk-softirq.c    |  4 ++--
 include/linux/blk-mq.h |  3 ++-
 include/linux/blkdev.h |  3 ---
 5 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b49f5bd86f42..5e7982918c54 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -546,13 +546,15 @@ EXPORT_SYMBOL(blk_mq_end_request);
 static void __blk_mq_complete_request_remote(void *data)
 {
 	struct request *rq = data;
+	struct request_queue *q = rq->q;
 
-	rq->q->softirq_done_fn(rq);
+	q->mq_ops->complete(rq);
 }
 
 static void __blk_mq_complete_request(struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct request_queue *q = rq->q;
 	bool shared = false;
 	int cpu;
 
@@ -568,18 +570,18 @@ static void __blk_mq_complete_request(struct request *rq)
 	 * So complete IO reqeust in softirq context in case of single queue
 	 * for not degrading IO performance by irqsoff latency.
 	 */
-	if (rq->q->nr_hw_queues == 1) {
+	if (q->nr_hw_queues == 1) {
 		__blk_complete_request(rq);
 		return;
 	}
 
-	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
-		rq->q->softirq_done_fn(rq);
+	if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
+		q->mq_ops->complete(rq);
 		return;
 	}
 
 	cpu = get_cpu();
-	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
+	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
 		shared = cpus_share_cache(cpu, ctx->cpu);
 
 	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
@@ -588,7 +590,7 @@ static void __blk_mq_complete_request(struct request *rq)
 		rq->csd.flags = 0;
 		smp_call_function_single_async(ctx->cpu, &rq->csd);
 	} else {
-		rq->q->softirq_done_fn(rq);
+		q->mq_ops->complete(rq);
 	}
 	put_cpu();
 }
@@ -2701,9 +2703,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	 */
 	q->poll_nsec = -1;
 
-	if (set->ops->complete)
-		blk_queue_softirq_done(q, set->ops->complete);
-
 	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 	blk_mq_add_queue_tag_set(set, q);
 	blk_mq_map_swqueue(q);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index e3f07d94b18d..cca83590a1dc 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -20,12 +20,6 @@ EXPORT_SYMBOL(blk_max_low_pfn);
 
 unsigned long blk_max_pfn;
 
-void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
-{
-	q->softirq_done_fn = fn;
-}
-EXPORT_SYMBOL(blk_queue_softirq_done);
-
 void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
 {
 	q->rq_timeout = timeout;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 8ca0f6caf174..727d64436ec4 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -34,7 +34,7 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
 
 		rq = list_entry(local_list.next, struct request, ipi_list);
 		list_del_init(&rq->ipi_list);
-		rq->q->softirq_done_fn(rq);
+		rq->q->mq_ops->complete(rq);
 	}
 }
 
@@ -102,7 +102,7 @@ void __blk_complete_request(struct request *req)
 	unsigned long flags;
 	bool shared = false;
 
-	BUG_ON(!q->softirq_done_fn);
+	BUG_ON(!q->mq_ops->complete);
 
 	local_irq_save(flags);
 	cpu = smp_processor_id();
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5c8418ebbfd6..9dd574e5436a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -115,6 +115,7 @@ typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);
+typedef void (complete_fn)(struct request *);
 
 
 struct blk_mq_ops {
@@ -142,7 +143,7 @@ struct blk_mq_ops {
 	 */
 	poll_fn			*poll;
 
-	softirq_done_fn		*complete;
+	complete_fn		*complete;
 
 	/*
 	 * Called when the block layer side of a hardware queue has been
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c675e2b5af62..d4104844d6bb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -290,7 +290,6 @@ typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
 
 struct bio_vec;
-typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 
 enum blk_eh_timer_return {
@@ -407,7 +406,6 @@ struct request_queue {
 
 	make_request_fn		*make_request_fn;
 	poll_q_fn		*poll_fn;
-	softirq_done_fn		*softirq_done_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 
 	const struct blk_mq_ops	*mq_ops;
@@ -1113,7 +1111,6 @@ extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
-extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua);

From 9cf2bab6307659b940da65d16dcc8f82c69f3a97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Oct 2018 17:01:22 -0600
Subject: [PATCH 038/351] block: kill request ->cpu member

This was used for completion placement for the legacy path,
but for mq we have rq->mq_ctx->cpu for that. Add a helper
to get the request CPU assignment, as the mq_ctx type is
private to blk-mq.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c                  | 2 --
 block/blk-merge.c                 | 2 --
 block/blk-mq.c                    | 7 ++++++-
 block/blk-softirq.c               | 2 +-
 drivers/scsi/bnx2i/bnx2i_hwi.c    | 8 +-------
 drivers/scsi/csiostor/csio_scsi.c | 8 +-------
 drivers/scsi/qla2xxx/qla_os.c     | 2 +-
 include/linux/blk-mq.h            | 2 ++
 include/linux/blkdev.h            | 2 --
 9 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a14dab57ff8b..3daab9df24e0 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -145,7 +145,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 
 	INIT_LIST_HEAD(&rq->queuelist);
 	INIT_LIST_HEAD(&rq->timeout_list);
-	rq->cpu = -1;
 	rq->q = q;
 	rq->__sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
@@ -1770,7 +1769,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
  */
 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
 {
-	dst->cpu = src->cpu;
 	dst->__sector = blk_rq_pos(src);
 	dst->__data_len = blk_rq_bytes(src);
 	if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index a399b2fa8bc8..91b2af332a84 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -806,8 +806,6 @@ static struct request *attempt_merge(struct request_queue *q,
 	blk_account_io_merge(next);
 
 	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
-	if (blk_rq_cpu_valid(next))
-		req->cpu = next->cpu;
 
 	/*
 	 * ownership of bio passed from next to req, return 'next' for
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5e7982918c54..67a2bafd4b29 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -297,7 +297,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
 	rq->rq_flags = rq_flags;
-	rq->cpu = -1;
 	rq->cmd_flags = op;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
 		rq->rq_flags |= RQF_PREEMPT;
@@ -3282,6 +3281,12 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 	return __blk_mq_poll(hctx, rq);
 }
 
+unsigned int blk_mq_rq_cpu(struct request *rq)
+{
+	return rq->mq_ctx->cpu;
+}
+EXPORT_SYMBOL(blk_mq_rq_cpu);
+
 static int __init blk_mq_init(void)
 {
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 727d64436ec4..1534066e306e 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -98,7 +98,7 @@ static int blk_softirq_cpu_dead(unsigned int cpu)
 void __blk_complete_request(struct request *req)
 {
 	struct request_queue *q = req->q;
-	int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu;
+	int cpu, ccpu = req->mq_ctx->cpu;
 	unsigned long flags;
 	bool shared = false;
 
diff --git a/drivers/scsi/bnx2i/bnx2i_hwi.c b/drivers/scsi/bnx2i/bnx2i_hwi.c
index e9e669a6c2bc..6bad2689edd4 100644
--- a/drivers/scsi/bnx2i/bnx2i_hwi.c
+++ b/drivers/scsi/bnx2i/bnx2i_hwi.c
@@ -1906,7 +1906,6 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session,
 	struct iscsi_task *task;
 	struct scsi_cmnd *sc;
 	int rc = 0;
-	int cpu;
 
 	spin_lock(&session->back_lock);
 	task = iscsi_itt_to_task(bnx2i_conn->cls_conn->dd_data,
@@ -1917,14 +1916,9 @@ static int bnx2i_queue_scsi_cmd_resp(struct iscsi_session *session,
 	}
 	sc = task->sc;
 
-	if (!blk_rq_cpu_valid(sc->request))
-		cpu = smp_processor_id();
-	else
-		cpu = sc->request->cpu;
-
 	spin_unlock(&session->back_lock);
 
-	p = &per_cpu(bnx2i_percpu, cpu);
+	p = &per_cpu(bnx2i_percpu, blk_mq_rq_cpu(sc->request));
 	spin_lock(&p->p_work_lock);
 	if (unlikely(!p->iothread)) {
 		rc = -EINVAL;
diff --git a/drivers/scsi/csiostor/csio_scsi.c b/drivers/scsi/csiostor/csio_scsi.c
index 8c15b7acb4b7..a95debbea0e4 100644
--- a/drivers/scsi/csiostor/csio_scsi.c
+++ b/drivers/scsi/csiostor/csio_scsi.c
@@ -1780,16 +1780,10 @@ csio_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *cmnd)
 	int nsge = 0;
 	int rv = SCSI_MLQUEUE_HOST_BUSY, nr;
 	int retval;
-	int cpu;
 	struct csio_scsi_qset *sqset;
 	struct fc_rport *rport = starget_to_rport(scsi_target(cmnd->device));
 
-	if (!blk_rq_cpu_valid(cmnd->request))
-		cpu = smp_processor_id();
-	else
-		cpu = cmnd->request->cpu;
-
-	sqset = &hw->sqset[ln->portid][cpu];
+	sqset = &hw->sqset[ln->portid][blk_mq_rq_cpu(cmnd->request)];
 
 	nr = fc_remote_port_chkready(rport);
 	if (nr) {
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 4ea9f2b4e04f..29dfd1bd164d 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -1460,7 +1460,7 @@ __qla2xxx_eh_generic_reset(char *name, enum nexus_wait_type type,
 		goto eh_reset_failed;
 	}
 	err = 2;
-	if (do_reset(fcport, cmd->device->lun, cmd->request->cpu + 1)
+	if (do_reset(fcport, cmd->device->lun, blk_mq_rq_cpu(cmd->request) + 1)
 		!= QLA_SUCCESS) {
 		ql_log(ql_log_warn, vha, 0x800c,
 		    "do_reset failed for cmd=%p.\n", cmd);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9dd574e5436a..d83a26fb37e5 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -300,6 +300,8 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
+unsigned int blk_mq_rq_cpu(struct request *rq);
+
 /**
  * blk_mq_mark_complete() - Set request state to complete
  * @rq: request to set to complete state
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d4104844d6bb..c8fa4d3d7fee 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -130,7 +130,6 @@ struct request {
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
 
-	int cpu;
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;
 
@@ -669,7 +668,6 @@ static inline bool blk_account_rq(struct request *rq)
 	return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq);
 }
 
-#define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)

From a8908939af569ce2419f43fd56eeaf003bc3d85d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 16 Oct 2018 14:23:06 -0600
Subject: [PATCH 039/351] blk-mq: kill q->mq_map

It's just a pointer to set->mq_map, use that instead. Move the
assignment a bit earlier, so we always know it's valid.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 13 ++++---------
 block/blk-mq.h         |  4 +++-
 include/linux/blkdev.h |  2 --
 3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 67a2bafd4b29..766facfa1f08 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2322,7 +2322,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
 	for_each_possible_cpu(i) {
-		hctx_idx = q->mq_map[i];
+		hctx_idx = set->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
 		if (!set->tags[hctx_idx] &&
 		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2332,7 +2332,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			 * case, remap the current ctx to hctx[0] which
 			 * is guaranteed to always have tags allocated
 			 */
-			q->mq_map[i] = 0;
+			set->mq_map[i] = 0;
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2430,8 +2430,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 				     struct request_queue *q)
 {
-	q->tag_set = set;
-
 	mutex_lock(&set->tag_list_lock);
 
 	/*
@@ -2468,8 +2466,6 @@ void blk_mq_release(struct request_queue *q)
 		kobject_put(&hctx->kobj);
 	}
 
-	q->mq_map = NULL;
-
 	kfree(q->queue_hw_ctx);
 
 	/*
@@ -2589,7 +2585,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		int node;
 		struct blk_mq_hw_ctx *hctx;
 
-		node = blk_mq_hw_queue_to_node(q->mq_map, i);
+		node = blk_mq_hw_queue_to_node(set->mq_map, i);
 		/*
 		 * If the hw queue has been mapped to another numa node,
 		 * we need to realloc the hctx. If allocation fails, fallback
@@ -2666,8 +2662,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!q->queue_hw_ctx)
 		goto err_percpu;
 
-	q->mq_map = set->mq_map;
-
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
 		goto err_hctxs;
@@ -2676,6 +2670,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
 	q->nr_queues = nr_cpu_ids;
+	q->tag_set = set;
 
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9497b47e2526..9536be06d022 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -75,7 +75,9 @@ extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 		int cpu)
 {
-	return q->queue_hw_ctx[q->mq_map[cpu]];
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	return q->queue_hw_ctx[set->mq_map[cpu]];
 }
 
 /*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c8fa4d3d7fee..2ae7465d68ab 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -409,8 +409,6 @@ struct request_queue {
 
 	const struct blk_mq_ops	*mq_ops;
 
-	unsigned int		*mq_map;
-
 	/* sw queues */
 	struct blk_mq_ctx __percpu	*queue_ctx;
 	unsigned int		nr_queues;

From ed76e329d74a4b15ac0f5fd3adbd52ec0178a134 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:06:14 -0600
Subject: [PATCH 040/351] blk-mq: abstract out queue map

This is in preparation for allowing multiple sets of maps per
queue, if so desired.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-cpumap.c                 | 10 ++++----
 block/blk-mq-pci.c                    | 10 ++++----
 block/blk-mq-rdma.c                   |  4 ++--
 block/blk-mq-virtio.c                 |  8 +++----
 block/blk-mq.c                        | 34 ++++++++++++++-------------
 block/blk-mq.h                        |  8 +++----
 drivers/block/virtio_blk.c            |  2 +-
 drivers/nvme/host/pci.c               |  2 +-
 drivers/scsi/qla2xxx/qla_os.c         |  5 ++--
 drivers/scsi/scsi_lib.c               |  2 +-
 drivers/scsi/smartpqi/smartpqi_init.c |  3 ++-
 drivers/scsi/virtio_scsi.c            |  3 ++-
 include/linux/blk-mq-pci.h            |  4 ++--
 include/linux/blk-mq-virtio.h         |  4 ++--
 include/linux/blk-mq.h                | 15 +++++++++---
 15 files changed, 64 insertions(+), 50 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 3eb169f15842..6e6686c55984 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -30,10 +30,10 @@ static int get_first_sibling(unsigned int cpu)
 	return cpu;
 }
 
-int blk_mq_map_queues(struct blk_mq_tag_set *set)
+int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 {
-	unsigned int *map = set->mq_map;
-	unsigned int nr_queues = set->nr_hw_queues;
+	unsigned int *map = qmap->mq_map;
+	unsigned int nr_queues = qmap->nr_queues;
 	unsigned int cpu, first_sibling;
 
 	for_each_possible_cpu(cpu) {
@@ -62,12 +62,12 @@ EXPORT_SYMBOL_GPL(blk_mq_map_queues);
  * We have no quick way of doing reverse lookups. This is only used at
  * queue init time, so runtime isn't important.
  */
-int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index)
+int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
 {
 	int i;
 
 	for_each_possible_cpu(i) {
-		if (index == mq_map[i])
+		if (index == qmap->mq_map[i])
 			return local_memory_node(cpu_to_node(i));
 	}
 
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index db644ec624f5..40333d60a850 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -31,26 +31,26 @@
  * that maps a queue to the CPUs that have irq affinity for the corresponding
  * vector.
  */
-int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
+int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
 			    int offset)
 {
 	const struct cpumask *mask;
 	unsigned int queue, cpu;
 
-	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+	for (queue = 0; queue < qmap->nr_queues; queue++) {
 		mask = pci_irq_get_affinity(pdev, queue + offset);
 		if (!mask)
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			set->mq_map[cpu] = queue;
+			qmap->mq_map[cpu] = queue;
 	}
 
 	return 0;
 
 fallback:
-	WARN_ON_ONCE(set->nr_hw_queues > 1);
-	blk_mq_clear_mq_map(set);
+	WARN_ON_ONCE(qmap->nr_queues > 1);
+	blk_mq_clear_mq_map(qmap);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 996167f1de18..a71576aff3a5 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -41,12 +41,12 @@ int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			set->mq_map[cpu] = queue;
+			set->map[0].mq_map[cpu] = queue;
 	}
 
 	return 0;
 
 fallback:
-	return blk_mq_map_queues(set);
+	return blk_mq_map_queues(&set->map[0]);
 }
 EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index c3afbca11299..661fbfef480f 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -29,7 +29,7 @@
  * that maps a queue to the CPUs that have irq affinity for the corresponding
  * vector.
  */
-int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
+int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
 		struct virtio_device *vdev, int first_vec)
 {
 	const struct cpumask *mask;
@@ -38,17 +38,17 @@ int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
 	if (!vdev->config->get_vq_affinity)
 		goto fallback;
 
-	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+	for (queue = 0; queue < qmap->nr_queues; queue++) {
 		mask = vdev->config->get_vq_affinity(vdev, first_vec + queue);
 		if (!mask)
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			set->mq_map[cpu] = queue;
+			qmap->mq_map[cpu] = queue;
 	}
 
 	return 0;
 fallback:
-	return blk_mq_map_queues(set);
+	return blk_mq_map_queues(qmap);
 }
 EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 766facfa1f08..fac88d16988b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1975,7 +1975,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	struct blk_mq_tags *tags;
 	int node;
 
-	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+	node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
 	if (node == NUMA_NO_NODE)
 		node = set->numa_node;
 
@@ -2031,7 +2031,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	size_t rq_size, left;
 	int node;
 
-	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+	node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx);
 	if (node == NUMA_NO_NODE)
 		node = set->numa_node;
 
@@ -2322,7 +2322,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
 	for_each_possible_cpu(i) {
-		hctx_idx = set->mq_map[i];
+		hctx_idx = set->map[0].mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
 		if (!set->tags[hctx_idx] &&
 		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
@@ -2332,7 +2332,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 			 * case, remap the current ctx to hctx[0] which
 			 * is guaranteed to always have tags allocated
 			 */
-			set->mq_map[i] = 0;
+			set->map[0].mq_map[i] = 0;
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2585,7 +2585,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		int node;
 		struct blk_mq_hw_ctx *hctx;
 
-		node = blk_mq_hw_queue_to_node(set->mq_map, i);
+		node = blk_mq_hw_queue_to_node(&set->map[0], i);
 		/*
 		 * If the hw queue has been mapped to another numa node,
 		 * we need to realloc the hctx. If allocation fails, fallback
@@ -2791,18 +2791,18 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 		 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
 		 * 	mask = get_cpu_mask(queue)
 		 * 	for_each_cpu(cpu, mask)
-		 * 		set->mq_map[cpu] = queue;
+		 * 		set->map.mq_map[cpu] = queue;
 		 * }
 		 *
 		 * When we need to remap, the table has to be cleared for
 		 * killing stale mapping since one CPU may not be mapped
 		 * to any hw queue.
 		 */
-		blk_mq_clear_mq_map(set);
+		blk_mq_clear_mq_map(&set->map[0]);
 
 		return set->ops->map_queues(set);
 	} else
-		return blk_mq_map_queues(set);
+		return blk_mq_map_queues(&set->map[0]);
 }
 
 /*
@@ -2857,10 +2857,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		return -ENOMEM;
 
 	ret = -ENOMEM;
-	set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map),
-				   GFP_KERNEL, set->numa_node);
-	if (!set->mq_map)
+	set->map[0].mq_map = kcalloc_node(nr_cpu_ids,
+					  sizeof(*set->map[0].mq_map),
+					  GFP_KERNEL, set->numa_node);
+	if (!set->map[0].mq_map)
 		goto out_free_tags;
+	set->map[0].nr_queues = set->nr_hw_queues;
 
 	ret = blk_mq_update_queue_map(set);
 	if (ret)
@@ -2876,8 +2878,8 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	return 0;
 
 out_free_mq_map:
-	kfree(set->mq_map);
-	set->mq_map = NULL;
+	kfree(set->map[0].mq_map);
+	set->map[0].mq_map = NULL;
 out_free_tags:
 	kfree(set->tags);
 	set->tags = NULL;
@@ -2892,8 +2894,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	for (i = 0; i < nr_cpu_ids; i++)
 		blk_mq_free_map_and_requests(set, i);
 
-	kfree(set->mq_map);
-	set->mq_map = NULL;
+	kfree(set->map[0].mq_map);
+	set->map[0].mq_map = NULL;
 
 	kfree(set->tags);
 	set->tags = NULL;
@@ -3054,7 +3056,7 @@ fallback:
 			pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
 					nr_hw_queues, prev_nr_hw_queues);
 			set->nr_hw_queues = prev_nr_hw_queues;
-			blk_mq_map_queues(set);
+			blk_mq_map_queues(&set->map[0]);
 			goto fallback;
 		}
 		blk_mq_map_swqueue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9536be06d022..889f0069dd80 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -70,14 +70,14 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 /*
  * CPU -> queue mappings
  */
-extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
+extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
 
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 		int cpu)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 
-	return q->queue_hw_ctx[set->mq_map[cpu]];
+	return q->queue_hw_ctx[set->map[0].mq_map[cpu]];
 }
 
 /*
@@ -206,12 +206,12 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
 	__blk_mq_put_driver_tag(hctx, rq);
 }
 
-static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set)
+static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
 {
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		set->mq_map[cpu] = 0;
+		qmap->mq_map[cpu] = 0;
 }
 
 #endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 086c6bb12baa..6e869d05f91e 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -624,7 +624,7 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
 {
 	struct virtio_blk *vblk = set->driver_data;
 
-	return blk_mq_virtio_map_queues(set, vblk->vdev, 0);
+	return blk_mq_virtio_map_queues(&set->map[0], vblk->vdev, 0);
 }
 
 #ifdef CONFIG_VIRTIO_BLK_SCSI
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c33bb201b884..49ad854d1b91 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -435,7 +435,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_dev *dev = set->driver_data;
 
-	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
+	return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev),
 			dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
 }
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 29dfd1bd164d..fdf3e52ee908 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -6934,11 +6934,12 @@ static int qla2xxx_map_queues(struct Scsi_Host *shost)
 {
 	int rc;
 	scsi_qla_host_t *vha = (scsi_qla_host_t *)shost->hostdata;
+	struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];
 
 	if (USER_CTRL_IRQ(vha->hw))
-		rc = blk_mq_map_queues(&shost->tag_set);
+		rc = blk_mq_map_queues(qmap);
 	else
-		rc = blk_mq_pci_map_queues(&shost->tag_set, vha->hw->pdev, 0);
+		rc = blk_mq_pci_map_queues(qmap, vha->hw->pdev, 0);
 	return rc;
 }
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 651be30ba96a..ed81b8e74cfe 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1812,7 +1812,7 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
 
 	if (shost->hostt->map_queues)
 		return shost->hostt->map_queues(shost);
-	return blk_mq_map_queues(set);
+	return blk_mq_map_queues(&set->map[0]);
 }
 
 void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
diff --git a/drivers/scsi/smartpqi/smartpqi_init.c b/drivers/scsi/smartpqi/smartpqi_init.c
index a25a07a0b7f0..bac084260d80 100644
--- a/drivers/scsi/smartpqi/smartpqi_init.c
+++ b/drivers/scsi/smartpqi/smartpqi_init.c
@@ -5319,7 +5319,8 @@ static int pqi_map_queues(struct Scsi_Host *shost)
 {
 	struct pqi_ctrl_info *ctrl_info = shost_to_hba(shost);
 
-	return blk_mq_pci_map_queues(&shost->tag_set, ctrl_info->pci_dev, 0);
+	return blk_mq_pci_map_queues(&shost->tag_set.map[0],
+					ctrl_info->pci_dev, 0);
 }
 
 static int pqi_getpciinfo_ioctl(struct pqi_ctrl_info *ctrl_info,
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 1c72db94270e..c3c95b314286 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -719,8 +719,9 @@ static void virtscsi_target_destroy(struct scsi_target *starget)
 static int virtscsi_map_queues(struct Scsi_Host *shost)
 {
 	struct virtio_scsi *vscsi = shost_priv(shost);
+	struct blk_mq_queue_map *qmap = &shost->tag_set.map[0];
 
-	return blk_mq_virtio_map_queues(&shost->tag_set, vscsi->vdev, 2);
+	return blk_mq_virtio_map_queues(qmap, vscsi->vdev, 2);
 }
 
 /*
diff --git a/include/linux/blk-mq-pci.h b/include/linux/blk-mq-pci.h
index 9f4c17f0d2d8..0b1f45c62623 100644
--- a/include/linux/blk-mq-pci.h
+++ b/include/linux/blk-mq-pci.h
@@ -2,10 +2,10 @@
 #ifndef _LINUX_BLK_MQ_PCI_H
 #define _LINUX_BLK_MQ_PCI_H
 
-struct blk_mq_tag_set;
+struct blk_mq_queue_map;
 struct pci_dev;
 
-int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev,
+int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
 			  int offset);
 
 #endif /* _LINUX_BLK_MQ_PCI_H */
diff --git a/include/linux/blk-mq-virtio.h b/include/linux/blk-mq-virtio.h
index 69b4da262c45..687ae287e1dc 100644
--- a/include/linux/blk-mq-virtio.h
+++ b/include/linux/blk-mq-virtio.h
@@ -2,10 +2,10 @@
 #ifndef _LINUX_BLK_MQ_VIRTIO_H
 #define _LINUX_BLK_MQ_VIRTIO_H
 
-struct blk_mq_tag_set;
+struct blk_mq_queue_map;
 struct virtio_device;
 
-int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set,
+int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
 		struct virtio_device *vdev, int first_vec);
 
 #endif /* _LINUX_BLK_MQ_VIRTIO_H */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d83a26fb37e5..176164888628 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -74,10 +74,19 @@ struct blk_mq_hw_ctx {
 	struct srcu_struct	srcu[0];
 };
 
+struct blk_mq_queue_map {
+	unsigned int *mq_map;
+	unsigned int nr_queues;
+};
+
+enum {
+	HCTX_MAX_TYPES = 1,
+};
+
 struct blk_mq_tag_set {
-	unsigned int		*mq_map;
+	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
 	const struct blk_mq_ops	*ops;
-	unsigned int		nr_hw_queues;
+	unsigned int		nr_hw_queues;	/* nr hw queues across maps */
 	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
 	unsigned int		cmd_size;	/* per-request extra data */
@@ -295,7 +304,7 @@ void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
 
-int blk_mq_map_queues(struct blk_mq_tag_set *set);
+int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);

From ff2c56609d9b1f0739ae3a3bfdb78191d01e4192 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:07:33 -0600
Subject: [PATCH 041/351] blk-mq: provide dummy blk_mq_map_queue_type() helper

Doesn't do anything right now, but it's needed as a prep patch
to get the interfaces right.

While in there, correct the blk_mq_map_queue() CPU type to an unsigned
int.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 889f0069dd80..d9facfb9ca51 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -73,13 +73,20 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
 
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
-		int cpu)
+						     unsigned int cpu)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
 
 	return q->queue_hw_ctx[set->map[0].mq_map[cpu]];
 }
 
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
+							  unsigned int hctx_type,
+							  unsigned int cpu)
+{
+	return blk_mq_map_queue(q, cpu);
+}
+
 /*
  * sysfs helpers
  */

From f9afca4d367b8c915f28d29fcaba7460640403ff Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:11:38 -0600
Subject: [PATCH 042/351] blk-mq: pass in request/bio flags to queue mapping

Prep patch for being able to place request based not just on
CPU location, but also on the type of request.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c      |  7 +++---
 block/blk-mq-debugfs.c |  4 +++-
 block/blk-mq-sched.c   | 16 ++++++++++----
 block/blk-mq-tag.c     |  5 +++--
 block/blk-mq.c         | 50 +++++++++++++++++++++++-------------------
 block/blk-mq.h         |  6 +++--
 block/blk.h            |  6 ++---
 7 files changed, 57 insertions(+), 37 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 248fe78c2b9b..77e9f5b2ee05 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -215,7 +215,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
-	hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu);
+	hctx = blk_mq_map_queue(q, flush_rq->cmd_flags, flush_rq->mq_ctx->cpu);
 	if (!q->elevator) {
 		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
 		flush_rq->tag = -1;
@@ -301,7 +301,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	if (!q->elevator) {
 		fq->orig_rq = first_rq;
 		flush_rq->tag = first_rq->tag;
-		hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu);
+		hctx = blk_mq_map_queue(q, first_rq->cmd_flags,
+					first_rq->mq_ctx->cpu);
 		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
 	} else {
 		flush_rq->internal_tag = first_rq->internal_tag;
@@ -324,7 +325,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 	unsigned long flags;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
-	hctx = blk_mq_map_queue(q, ctx->cpu);
+	hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
 
 	if (q->elevator) {
 		WARN_ON(rq->tag < 0);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 9ed43a7c70b5..fac70c81b7de 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -427,8 +427,10 @@ struct show_busy_params {
 static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
 {
 	const struct show_busy_params *params = data;
+	struct blk_mq_hw_ctx *hctx;
 
-	if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx)
+	hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
+	if (hctx == params->hctx)
 		__blk_mq_debugfs_rq_show(params->m,
 					 list_entry_rq(&rq->queuelist));
 }
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 0feefd6c6aaa..68087bf71a61 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -310,7 +310,7 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
 	bool ret = false;
 
 	if (e && e->type->ops.bio_merge) {
@@ -366,7 +366,9 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
 
 	/* flush rq in flush machinery need to be dispatched directly */
 	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
@@ -399,9 +401,15 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
 				  struct blk_mq_ctx *ctx,
 				  struct list_head *list, bool run_queue_async)
 {
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-	struct elevator_queue *e = hctx->queue->elevator;
+	struct blk_mq_hw_ctx *hctx;
+	struct elevator_queue *e;
+	struct request *rq;
 
+	/* For list inserts, requests better be on the same hw queue */
+	rq = list_first_entry(list, struct request, queuelist);
+	hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
+
+	e = hctx->queue->elevator;
 	if (e && e->type->ops.insert_requests)
 		e->type->ops.insert_requests(hctx, list, false);
 	else {
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 4254e74c1446..478a959357f5 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -168,7 +168,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		io_schedule();
 
 		data->ctx = blk_mq_get_ctx(data->q);
-		data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
+		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
+						data->ctx->cpu);
 		tags = blk_mq_tags_from_data(data);
 		if (data->flags & BLK_MQ_REQ_RESERVED)
 			bt = &tags->breserved_tags;
@@ -530,7 +531,7 @@ u32 blk_mq_unique_tag(struct request *rq)
 	struct blk_mq_hw_ctx *hctx;
 	int hwq = 0;
 
-	hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+	hctx = blk_mq_map_queue(q, rq->cmd_flags, rq->mq_ctx->cpu);
 	hwq = hctx->queue_num;
 
 	return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fac88d16988b..67dec64440dd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -331,8 +331,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 }
 
 static struct request *blk_mq_get_request(struct request_queue *q,
-		struct bio *bio, unsigned int op,
-		struct blk_mq_alloc_data *data)
+					  struct bio *bio,
+					  struct blk_mq_alloc_data *data)
 {
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
@@ -346,8 +346,9 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		put_ctx_on_error = true;
 	}
 	if (likely(!data->hctx))
-		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
-	if (op & REQ_NOWAIT)
+		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
+						data->ctx->cpu);
+	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
 	if (e) {
@@ -358,9 +359,10 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		 * dispatch list. Don't include reserved tags in the
 		 * limiting, as it isn't useful.
 		 */
-		if (!op_is_flush(op) && e->type->ops.limit_depth &&
+		if (!op_is_flush(data->cmd_flags) &&
+		    e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
-			e->type->ops.limit_depth(op, data);
+			e->type->ops.limit_depth(data->cmd_flags, data);
 	} else {
 		blk_mq_tag_busy(data->hctx);
 	}
@@ -375,8 +377,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 		return NULL;
 	}
 
-	rq = blk_mq_rq_ctx_init(data, tag, op);
-	if (!op_is_flush(op)) {
+	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
+	if (!op_is_flush(data->cmd_flags)) {
 		rq->elv.icq = NULL;
 		if (e && e->type->ops.prepare_request) {
 			if (e->type->icq_cache && rq_ioc(bio))
@@ -393,7 +395,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags };
+	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 	struct request *rq;
 	int ret;
 
@@ -401,7 +403,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 	if (ret)
 		return ERR_PTR(ret);
 
-	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
+	rq = blk_mq_get_request(q, NULL, &alloc_data);
 	blk_queue_exit(q);
 
 	if (!rq)
@@ -419,7 +421,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags };
+	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
 	struct request *rq;
 	unsigned int cpu;
 	int ret;
@@ -452,7 +454,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
-	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
+	rq = blk_mq_get_request(q, NULL, &alloc_data);
 	blk_queue_exit(q);
 
 	if (!rq)
@@ -466,7 +468,7 @@ static void __blk_mq_free_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
 	const int sched_tag = rq->internal_tag;
 
 	blk_pm_mark_last_busy(rq);
@@ -483,7 +485,7 @@ void blk_mq_free_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
 
 	if (rq->rq_flags & RQF_ELVPRIV) {
 		if (e && e->type->ops.finish_request)
@@ -977,8 +979,9 @@ bool blk_mq_get_driver_tag(struct request *rq)
 {
 	struct blk_mq_alloc_data data = {
 		.q = rq->q,
-		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+		.hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu),
 		.flags = BLK_MQ_REQ_NOWAIT,
+		.cmd_flags = rq->cmd_flags,
 	};
 	bool shared;
 
@@ -1142,7 +1145,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 
 		rq = list_first_entry(list, struct request, queuelist);
 
-		hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+		hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
 		if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -1573,7 +1576,8 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
+							ctx->cpu);
 
 	spin_lock(&hctx->lock);
 	list_add_tail(&rq->queuelist, &hctx->dispatch);
@@ -1783,7 +1787,8 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
 	int srcu_idx;
 	blk_qc_t unused_cookie;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
+							ctx->cpu);
 
 	hctx_lock(hctx, &srcu_idx);
 	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
@@ -1817,7 +1822,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
 	const int is_flush_fua = op_is_flush(bio->bi_opf);
-	struct blk_mq_alloc_data data = { .flags = 0 };
+	struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf };
 	struct request *rq;
 	unsigned int request_count = 0;
 	struct blk_plug *plug;
@@ -1840,7 +1845,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	rq_qos_throttle(q, bio, NULL);
 
-	rq = blk_mq_get_request(q, bio, bio->bi_opf, &data);
+	rq = blk_mq_get_request(q, bio, &data);
 	if (unlikely(!rq)) {
 		rq_qos_cleanup(q, bio);
 		if (bio->bi_opf & REQ_NOWAIT)
@@ -1909,6 +1914,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 		if (same_queue_rq) {
 			data.hctx = blk_mq_map_queue(q,
+					same_queue_rq->cmd_flags,
 					same_queue_rq->mq_ctx->cpu);
 			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
 					&cookie);
@@ -2263,7 +2269,7 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
 		 */
-		hctx = blk_mq_map_queue(q, i);
+		hctx = blk_mq_map_queue_type(q, 0, i);
 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
 			hctx->numa_node = local_memory_node(cpu_to_node(i));
 	}
@@ -2336,7 +2342,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
-		hctx = blk_mq_map_queue(q, i);
+		hctx = blk_mq_map_queue_type(q, 0, i);
 
 		cpumask_set_cpu(i, hctx->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d9facfb9ca51..6a8f8b60d8ba 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -73,6 +73,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
 
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
+						     unsigned int flags,
 						     unsigned int cpu)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
@@ -84,7 +85,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
 							  unsigned int hctx_type,
 							  unsigned int cpu)
 {
-	return blk_mq_map_queue(q, cpu);
+	return blk_mq_map_queue(q, hctx_type, cpu);
 }
 
 /*
@@ -135,6 +136,7 @@ struct blk_mq_alloc_data {
 	struct request_queue *q;
 	blk_mq_req_flags_t flags;
 	unsigned int shallow_depth;
+	unsigned int cmd_flags;
 
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
@@ -209,7 +211,7 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
 	if (rq->tag == -1 || rq->internal_tag == -1)
 		return;
 
-	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+	hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
 	__blk_mq_put_driver_tag(hctx, rq);
 }
 
diff --git a/block/blk.h b/block/blk.h
index 2bf1cfeeb9c0..78ae94886acf 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -104,10 +104,10 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 	__clear_bit(flag, &q->queue_flags);
 }
 
-static inline struct blk_flush_queue *blk_get_flush_queue(
-		struct request_queue *q, struct blk_mq_ctx *ctx)
+static inline struct blk_flush_queue *
+blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
 {
-	return blk_mq_map_queue(q, ctx->cpu)->fq;
+	return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq;
 }
 
 static inline void __blk_get_queue(struct request_queue *q)

From f31967f0e455d08d3ea1d2f849bf62dafc92dbf4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:13:29 -0600
Subject: [PATCH 043/351] blk-mq: allow software queue to map to multiple
 hardware queues

The mapping used to be dependent on just the CPU location, but
now it's a tuple of (type, cpu) instead. This is a prep patch
for allowing a single software queue to map to multiple hardware
queues. No functional changes in this patch.

This changes the software queue count to an unsigned short
to save a bit of space. We can still support 64K-1 CPUs,
which should be enough. Add a check to catch a wrap.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c   |  2 +-
 block/blk-mq.c         | 22 ++++++++++++++++------
 block/blk-mq.h         |  2 +-
 block/kyber-iosched.c  |  6 +++---
 include/linux/blk-mq.h |  3 ++-
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 68087bf71a61..bbabc3877d5a 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -109,7 +109,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
 					  struct blk_mq_ctx *ctx)
 {
-	unsigned idx = ctx->index_hw;
+	unsigned short idx = ctx->index_hw[hctx->type];
 
 	if (++idx == hctx->nr_ctx)
 		idx = 0;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 67dec64440dd..31976bff8ad2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -75,14 +75,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
 				     struct blk_mq_ctx *ctx)
 {
-	if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
-		sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	if (!sbitmap_test_bit(&hctx->ctx_map, bit))
+		sbitmap_set_bit(&hctx->ctx_map, bit);
 }
 
 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
 				      struct blk_mq_ctx *ctx)
 {
-	sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
+	const int bit = ctx->index_hw[hctx->type];
+
+	sbitmap_clear_bit(&hctx->ctx_map, bit);
 }
 
 struct mq_inflight {
@@ -955,7 +959,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_ctx *start)
 {
-	unsigned off = start ? start->index_hw : 0;
+	unsigned off = start ? start->index_hw[hctx->type] : 0;
 	struct dispatch_rq_data data = {
 		.hctx = hctx,
 		.rq   = NULL,
@@ -2343,10 +2347,16 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
 		hctx = blk_mq_map_queue_type(q, 0, i);
-
+		hctx->type = 0;
 		cpumask_set_cpu(i, hctx->cpumask);
-		ctx->index_hw = hctx->nr_ctx;
+		ctx->index_hw[hctx->type] = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
+
+		/*
+		 * If the nr_ctx type overflows, we have exceeded the
+		 * amount of sw queues we can support.
+		 */
+		BUG_ON(!hctx->nr_ctx);
 	}
 
 	mutex_unlock(&q->sysfs_lock);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 6a8f8b60d8ba..1821f448f7c4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -17,7 +17,7 @@ struct blk_mq_ctx {
 	}  ____cacheline_aligned_in_smp;
 
 	unsigned int		cpu;
-	unsigned int		index_hw;
+	unsigned short		index_hw[HCTX_MAX_TYPES];
 
 	/* incremented at dispatch time */
 	unsigned long		rq_dispatched[2];
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 1fd83a91e749..de78e8aa7b0a 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -576,7 +576,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 {
 	struct kyber_hctx_data *khd = hctx->sched_data;
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
-	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
+	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
 	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
 	struct list_head *rq_list = &kcq->rq_list[sched_domain];
 	bool merged;
@@ -602,7 +602,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
 
 	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
 		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
-		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
+		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
 		struct list_head *head = &kcq->rq_list[sched_domain];
 
 		spin_lock(&kcq->lock);
@@ -611,7 +611,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
 		else
 			list_move_tail(&rq->queuelist, head);
 		sbitmap_set_bit(&khd->kcq_map[sched_domain],
-				rq->mq_ctx->index_hw);
+				rq->mq_ctx->index_hw[hctx->type]);
 		blk_mq_sched_request_inserted(rq);
 		spin_unlock(&kcq->lock);
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 176164888628..6c39d546c50b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -37,7 +37,8 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	*dispatch_from;
 	unsigned int		dispatch_busy;
 
-	unsigned int		nr_ctx;
+	unsigned short		type;
+	unsigned short		nr_ctx;
 	struct blk_mq_ctx	**ctxs;
 
 	spinlock_t		dispatch_wait_lock;

From a783b81820fe3532809c98371ec904dfdb0ea9e5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 25 Oct 2018 08:58:14 -0600
Subject: [PATCH 044/351] blk-mq: add 'type' attribute to the sysfs hctx
 directory

It can be useful for a user to verify what type a given hardware
queue is, expose this information in sysfs.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sysfs.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index aafb44224c89..2d737f9e7ba7 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -161,6 +161,11 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_type_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	return sprintf(page, "%u\n", hctx->type);
+}
+
 static struct attribute *default_ctx_attrs[] = {
 	NULL,
 };
@@ -177,11 +182,16 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
 	.attr = {.name = "cpu_list", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_cpus_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_type = {
+	.attr = {.name = "type", .mode = 0444 },
+	.show = blk_mq_hw_sysfs_type_show,
+};
 
 static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_nr_tags.attr,
 	&blk_mq_hw_sysfs_nr_reserved_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
+	&blk_mq_hw_sysfs_type.attr,
 	NULL,
 };
 

From b3c661b15d5ab11d982e58bee23e05c1780528a1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Oct 2018 10:36:06 -0600
Subject: [PATCH 045/351] blk-mq: support multiple hctx maps

Add support for the tag set carrying multiple queue maps, and
for the driver to inform blk-mq how many it wishes to support
through setting set->nr_maps.

This adds an mq_ops helper for drivers that support more than 1
map, mq_ops->rq_flags_to_type(). The function takes request/bio
flags and CPU, and returns a queue map index for that. We then
use the type information in blk_mq_map_queue() to index the map
set.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 92 ++++++++++++++++++++++++++++--------------
 block/blk-mq.h         | 37 ++++++++++++-----
 include/linux/blk-mq.h | 14 +++++++
 3 files changed, 102 insertions(+), 41 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 31976bff8ad2..2e730c95513f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2258,7 +2258,8 @@ static int blk_mq_init_hctx(struct request_queue *q,
 static void blk_mq_init_cpu_queues(struct request_queue *q,
 				   unsigned int nr_hw_queues)
 {
-	unsigned int i;
+	struct blk_mq_tag_set *set = q->tag_set;
+	unsigned int i, j;
 
 	for_each_possible_cpu(i) {
 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2273,9 +2274,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
 		 */
-		hctx = blk_mq_map_queue_type(q, 0, i);
-		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
-			hctx->numa_node = local_memory_node(cpu_to_node(i));
+		for (j = 0; j < set->nr_maps; j++) {
+			hctx = blk_mq_map_queue_type(q, j, i);
+			if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
+				hctx->numa_node = local_memory_node(cpu_to_node(i));
+		}
 	}
 }
 
@@ -2310,7 +2313,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 
 static void blk_mq_map_swqueue(struct request_queue *q)
 {
-	unsigned int i, hctx_idx;
+	unsigned int i, j, hctx_idx;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_tag_set *set = q->tag_set;
@@ -2346,17 +2349,28 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
-		hctx = blk_mq_map_queue_type(q, 0, i);
-		hctx->type = 0;
-		cpumask_set_cpu(i, hctx->cpumask);
-		ctx->index_hw[hctx->type] = hctx->nr_ctx;
-		hctx->ctxs[hctx->nr_ctx++] = ctx;
+		for (j = 0; j < set->nr_maps; j++) {
+			hctx = blk_mq_map_queue_type(q, j, i);
 
-		/*
-		 * If the nr_ctx type overflows, we have exceeded the
-		 * amount of sw queues we can support.
-		 */
-		BUG_ON(!hctx->nr_ctx);
+			/*
+			 * If the CPU is already set in the mask, then we've
+			 * mapped this one already. This can happen if
+			 * devices share queues across queue maps.
+			 */
+			if (cpumask_test_cpu(i, hctx->cpumask))
+				continue;
+
+			cpumask_set_cpu(i, hctx->cpumask);
+			hctx->type = j;
+			ctx->index_hw[hctx->type] = hctx->nr_ctx;
+			hctx->ctxs[hctx->nr_ctx++] = ctx;
+
+			/*
+			 * If the nr_ctx type overflows, we have exceeded the
+			 * amount of sw queues we can support.
+			 */
+			BUG_ON(!hctx->nr_ctx);
+		}
 	}
 
 	mutex_unlock(&q->sysfs_lock);
@@ -2524,6 +2538,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
 	memset(set, 0, sizeof(*set));
 	set->ops = ops;
 	set->nr_hw_queues = 1;
+	set->nr_maps = 1;
 	set->queue_depth = queue_depth;
 	set->numa_node = NUMA_NO_NODE;
 	set->flags = set_flags;
@@ -2800,6 +2815,8 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 {
 	if (set->ops->map_queues) {
+		int i;
+
 		/*
 		 * transport .map_queues is usually done in the following
 		 * way:
@@ -2807,18 +2824,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 		 * for (queue = 0; queue < set->nr_hw_queues; queue++) {
 		 * 	mask = get_cpu_mask(queue)
 		 * 	for_each_cpu(cpu, mask)
-		 * 		set->map.mq_map[cpu] = queue;
+		 * 		set->map[x].mq_map[cpu] = queue;
 		 * }
 		 *
 		 * When we need to remap, the table has to be cleared for
 		 * killing stale mapping since one CPU may not be mapped
 		 * to any hw queue.
 		 */
-		blk_mq_clear_mq_map(&set->map[0]);
+		for (i = 0; i < set->nr_maps; i++)
+			blk_mq_clear_mq_map(&set->map[i]);
 
 		return set->ops->map_queues(set);
-	} else
+	} else {
+		BUG_ON(set->nr_maps > 1);
 		return blk_mq_map_queues(&set->map[0]);
+	}
 }
 
 /*
@@ -2829,7 +2849,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
  */
 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 {
-	int ret;
+	int i, ret;
 
 	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
 
@@ -2852,6 +2872,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		set->queue_depth = BLK_MQ_MAX_DEPTH;
 	}
 
+	if (!set->nr_maps)
+		set->nr_maps = 1;
+	else if (set->nr_maps > HCTX_MAX_TYPES)
+		return -EINVAL;
+
 	/*
 	 * If a crashdump is active, then we are potentially in a very
 	 * memory constrained environment. Limit us to 1 queue and
@@ -2873,12 +2898,14 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		return -ENOMEM;
 
 	ret = -ENOMEM;
-	set->map[0].mq_map = kcalloc_node(nr_cpu_ids,
-					  sizeof(*set->map[0].mq_map),
-					  GFP_KERNEL, set->numa_node);
-	if (!set->map[0].mq_map)
-		goto out_free_tags;
-	set->map[0].nr_queues = set->nr_hw_queues;
+	for (i = 0; i < set->nr_maps; i++) {
+		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
+						  sizeof(struct blk_mq_queue_map),
+						  GFP_KERNEL, set->numa_node);
+		if (!set->map[i].mq_map)
+			goto out_free_mq_map;
+		set->map[i].nr_queues = set->nr_hw_queues;
+	}
 
 	ret = blk_mq_update_queue_map(set);
 	if (ret)
@@ -2894,9 +2921,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	return 0;
 
 out_free_mq_map:
-	kfree(set->map[0].mq_map);
-	set->map[0].mq_map = NULL;
-out_free_tags:
+	for (i = 0; i < set->nr_maps; i++) {
+		kfree(set->map[i].mq_map);
+		set->map[i].mq_map = NULL;
+	}
 	kfree(set->tags);
 	set->tags = NULL;
 	return ret;
@@ -2905,13 +2933,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set);
 
 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
-	int i;
+	int i, j;
 
 	for (i = 0; i < nr_cpu_ids; i++)
 		blk_mq_free_map_and_requests(set, i);
 
-	kfree(set->map[0].mq_map);
-	set->map[0].mq_map = NULL;
+	for (j = 0; j < set->nr_maps; j++) {
+		kfree(set->map[j].mq_map);
+		set->map[j].mq_map = NULL;
+	}
 
 	kfree(set->tags);
 	set->tags = NULL;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1821f448f7c4..053862270125 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -72,20 +72,37 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
  */
 extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
 
-static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
-						     unsigned int flags,
-						     unsigned int cpu)
-{
-	struct blk_mq_tag_set *set = q->tag_set;
-
-	return q->queue_hw_ctx[set->map[0].mq_map[cpu]];
-}
-
+/*
+ * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
+ * @q: request queue
+ * @hctx_type: the hctx type index
+ * @cpu: CPU
+ */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
 							  unsigned int hctx_type,
 							  unsigned int cpu)
 {
-	return blk_mq_map_queue(q, hctx_type, cpu);
+	struct blk_mq_tag_set *set = q->tag_set;
+
+	return q->queue_hw_ctx[set->map[hctx_type].mq_map[cpu]];
+}
+
+/*
+ * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
+ * @q: request queue
+ * @flags: request command flags
+ * @cpu: CPU
+ */
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
+						     unsigned int flags,
+						     unsigned int cpu)
+{
+	int hctx_type = 0;
+
+	if (q->mq_ops->rq_flags_to_type)
+		hctx_type = q->mq_ops->rq_flags_to_type(q, flags);
+
+	return blk_mq_map_queue_type(q, hctx_type, cpu);
 }
 
 /*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6c39d546c50b..8994c95056a8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -85,7 +85,14 @@ enum {
 };
 
 struct blk_mq_tag_set {
+	/*
+	 * map[] holds ctx -> hctx mappings, one map exists for each type
+	 * that the driver wishes to support. There are no restrictions
+	 * on maps being of the same size, and it's perfectly legal to
+	 * share maps between types.
+	 */
 	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
+	unsigned int		nr_maps;	/* nr entries in map[] */
 	const struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;	/* nr hw queues across maps */
 	unsigned int		queue_depth;	/* max hw supported */
@@ -109,6 +116,8 @@ struct blk_mq_queue_data {
 
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
+/* takes rq->cmd_flags as input, returns a hardware type index */
+typedef int (rq_flags_to_type_fn)(struct request_queue *, unsigned int);
 typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
 typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
@@ -134,6 +143,11 @@ struct blk_mq_ops {
 	 */
 	queue_rq_fn		*queue_rq;
 
+	/*
+	 * Return a queue map type for the given request/bio flags
+	 */
+	rq_flags_to_type_fn	*rq_flags_to_type;
+
 	/*
 	 * Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the

From 392546aed22009060911f76b6ea24520e2f8b50f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 13:25:27 -0600
Subject: [PATCH 046/351] blk-mq: separate number of hardware queues from
 nr_cpu_ids

With multiple maps, nr_cpu_ids is no longer the maximum number of
hardware queues we support on a given devices. The initializer of
the tag_set can have set ->nr_hw_queues larger than the available
number of CPUs, since we can exceed that with multiple queue maps.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2e730c95513f..ccf135cf41b0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2669,6 +2669,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 	mutex_unlock(&q->sysfs_lock);
 }
 
+/*
+ * Maximum number of hardware queues we support. For single sets, we'll never
+ * have more than the CPUs (software queues). For multiple sets, the tag_set
+ * user may have set ->nr_hw_queues larger.
+ */
+static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
+{
+	if (set->nr_maps == 1)
+		return nr_cpu_ids;
+
+	return max(set->nr_hw_queues, nr_cpu_ids);
+}
+
 struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 						  struct request_queue *q)
 {
@@ -2688,7 +2701,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	/* init q->mq_kobj and sw queues' kobjects */
 	blk_mq_sysfs_init(q);
 
-	q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)),
+	q->nr_queues = nr_hw_queues(set);
+	q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
 						GFP_KERNEL, set->numa_node);
 	if (!q->queue_hw_ctx)
 		goto err_percpu;
@@ -2700,7 +2714,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
 	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
 
-	q->nr_queues = nr_cpu_ids;
 	q->tag_set = set;
 
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
@@ -2887,12 +2900,13 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 		set->queue_depth = min(64U, set->queue_depth);
 	}
 	/*
-	 * There is no use for more h/w queues than cpus.
+	 * There is no use for more h/w queues than cpus if we just have
+	 * a single map
 	 */
-	if (set->nr_hw_queues > nr_cpu_ids)
+	if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
 		set->nr_hw_queues = nr_cpu_ids;
 
-	set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *),
+	set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
 		return -ENOMEM;
@@ -2935,7 +2949,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
 	int i, j;
 
-	for (i = 0; i < nr_cpu_ids; i++)
+	for (i = 0; i < nr_hw_queues(set); i++)
 		blk_mq_free_map_and_requests(set, i);
 
 	for (j = 0; j < set->nr_maps; j++) {
@@ -3067,7 +3081,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 
 	lockdep_assert_held(&set->tag_list_lock);
 
-	if (nr_hw_queues > nr_cpu_ids)
+	if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
 		nr_hw_queues = nr_cpu_ids;
 	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
 		return;

From ea4f995ee8b8f0578b3319949f2edd5d812fdb0a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 29 Oct 2018 15:06:13 -0600
Subject: [PATCH 047/351] blk-mq: cache request hardware queue mapping

We call blk_mq_map_queue() a lot, at least two times for each
request per IO, sometimes more. Since we now have an indirect
call as well in that function. cache the mapping so we don't
have to re-call blk_mq_map_queue() for the same request
multiple times.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c      | 12 ++++--------
 block/blk-mq-debugfs.c |  4 +---
 block/blk-mq-sched.c   |  6 ++----
 block/blk-mq-tag.c     |  9 +--------
 block/blk-mq.c         | 22 +++++++++-------------
 block/blk-mq.h         |  5 +----
 include/linux/blkdev.h |  1 +
 7 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 77e9f5b2ee05..c53197dcdd70 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -215,7 +215,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
-	hctx = blk_mq_map_queue(q, flush_rq->cmd_flags, flush_rq->mq_ctx->cpu);
+	hctx = flush_rq->mq_hctx;
 	if (!q->elevator) {
 		blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
 		flush_rq->tag = -1;
@@ -262,7 +262,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	struct request *first_rq =
 		list_first_entry(pending, struct request, flush.list);
 	struct request *flush_rq = fq->flush_rq;
-	struct blk_mq_hw_ctx *hctx;
 
 	/* C1 described at the top of this file */
 	if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
@@ -297,13 +296,12 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	 * just for cheating put/get driver tag.
 	 */
 	flush_rq->mq_ctx = first_rq->mq_ctx;
+	flush_rq->mq_hctx = first_rq->mq_hctx;
 
 	if (!q->elevator) {
 		fq->orig_rq = first_rq;
 		flush_rq->tag = first_rq->tag;
-		hctx = blk_mq_map_queue(q, first_rq->cmd_flags,
-					first_rq->mq_ctx->cpu);
-		blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+		blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
 	} else {
 		flush_rq->internal_tag = first_rq->internal_tag;
 	}
@@ -320,13 +318,11 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
 {
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	unsigned long flags;
 	struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
 
-	hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
-
 	if (q->elevator) {
 		WARN_ON(rq->tag < 0);
 		blk_mq_put_driver_tag_hctx(hctx, rq);
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index fac70c81b7de..cde19be36135 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -427,10 +427,8 @@ struct show_busy_params {
 static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
 {
 	const struct show_busy_params *params = data;
-	struct blk_mq_hw_ctx *hctx;
 
-	hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
-	if (hctx == params->hctx)
+	if (rq->mq_hctx == params->hctx)
 		__blk_mq_debugfs_rq_show(params->m,
 					 list_entry_rq(&rq->queuelist));
 }
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index bbabc3877d5a..641df3f00632 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -366,9 +366,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx;
-
-	hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
 	/* flush rq in flush machinery need to be dispatched directly */
 	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
@@ -407,7 +405,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
 
 	/* For list inserts, requests better be on the same hw queue */
 	rq = list_first_entry(list, struct request, queuelist);
-	hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
+	hctx = rq->mq_hctx;
 
 	e = hctx->queue->elevator;
 	if (e && e->type->ops.insert_requests)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 478a959357f5..fb836d818b80 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -527,14 +527,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
  */
 u32 blk_mq_unique_tag(struct request *rq)
 {
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx;
-	int hwq = 0;
-
-	hctx = blk_mq_map_queue(q, rq->cmd_flags, rq->mq_ctx->cpu);
-	hwq = hctx->queue_num;
-
-	return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
+	return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
 		(rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
 }
 EXPORT_SYMBOL(blk_mq_unique_tag);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ccf135cf41b0..6b2859d3ad23 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -300,6 +300,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	/* csd/requeue_work/fifo_time is initialized before use */
 	rq->q = data->q;
 	rq->mq_ctx = data->ctx;
+	rq->mq_hctx = data->hctx;
 	rq->rq_flags = rq_flags;
 	rq->cmd_flags = op;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
@@ -472,10 +473,11 @@ static void __blk_mq_free_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 	const int sched_tag = rq->internal_tag;
 
 	blk_pm_mark_last_busy(rq);
+	rq->mq_hctx = NULL;
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
@@ -489,7 +491,7 @@ void blk_mq_free_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
 	if (rq->rq_flags & RQF_ELVPRIV) {
 		if (e && e->type->ops.finish_request)
@@ -983,7 +985,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
 {
 	struct blk_mq_alloc_data data = {
 		.q = rq->q,
-		.hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu),
+		.hctx = rq->mq_hctx,
 		.flags = BLK_MQ_REQ_NOWAIT,
 		.cmd_flags = rq->cmd_flags,
 	};
@@ -1149,7 +1151,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 
 		rq = list_first_entry(list, struct request, queuelist);
 
-		hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
+		hctx = rq->mq_hctx;
 		if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
 			break;
 
@@ -1579,9 +1581,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
  */
 void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
 {
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
-							ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
 	spin_lock(&hctx->lock);
 	list_add_tail(&rq->queuelist, &hctx->dispatch);
@@ -1790,9 +1790,7 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
 	blk_status_t ret;
 	int srcu_idx;
 	blk_qc_t unused_cookie;
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
-							ctx->cpu);
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
 	hctx_lock(hctx, &srcu_idx);
 	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
@@ -1917,9 +1915,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_mq_put_ctx(data.ctx);
 
 		if (same_queue_rq) {
-			data.hctx = blk_mq_map_queue(q,
-					same_queue_rq->cmd_flags,
-					same_queue_rq->mq_ctx->cpu);
+			data.hctx = same_queue_rq->mq_hctx;
 			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
 					&cookie);
 		}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 053862270125..facb6e9ddce4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -223,13 +223,10 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
 
 static inline void blk_mq_put_driver_tag(struct request *rq)
 {
-	struct blk_mq_hw_ctx *hctx;
-
 	if (rq->tag == -1 || rq->internal_tag == -1)
 		return;
 
-	hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
-	__blk_mq_put_driver_tag(hctx, rq);
+	__blk_mq_put_driver_tag(rq->mq_hctx, rq);
 }
 
 static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2ae7465d68ab..9b1f470cc784 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -129,6 +129,7 @@ enum mq_rq_state {
 struct request {
 	struct request_queue *q;
 	struct blk_mq_ctx *mq_ctx;
+	struct blk_mq_hw_ctx *mq_hctx;
 
 	unsigned int cmd_flags;		/* op and common flags */
 	req_flags_t rq_flags;

From 67cae4c948a5311121905a2a8740c50daf7f6478 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Oct 2018 11:31:51 -0600
Subject: [PATCH 048/351] blk-mq: cleanup and improve list insertion

It's somewhat strange to have a list insertion function that
relies on the fact that the caller has mapped things correctly.
Pass in the hardware queue directly for insertion, which makes
for a much cleaner interface and implementation.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c |  8 +-------
 block/blk-mq-sched.h |  2 +-
 block/blk-mq.c       | 25 ++++++++++++++-----------
 3 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 641df3f00632..66fda19be5a3 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -395,17 +395,11 @@ run:
 		blk_mq_run_hw_queue(hctx, async);
 }
 
-void blk_mq_sched_insert_requests(struct request_queue *q,
+void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
 				  struct blk_mq_ctx *ctx,
 				  struct list_head *list, bool run_queue_async)
 {
-	struct blk_mq_hw_ctx *hctx;
 	struct elevator_queue *e;
-	struct request *rq;
-
-	/* For list inserts, requests better be on the same hw queue */
-	rq = list_first_entry(list, struct request, queuelist);
-	hctx = rq->mq_hctx;
 
 	e = hctx->queue->elevator;
 	if (e && e->type->ops.insert_requests)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 947f236b273d..7ff5671bf128 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -19,7 +19,7 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
 				 bool run_queue, bool async);
-void blk_mq_sched_insert_requests(struct request_queue *q,
+void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
 				  struct blk_mq_ctx *ctx,
 				  struct list_head *list, bool run_queue_async);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6b2859d3ad23..271726d48003 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1624,11 +1624,12 @@ static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 {
+	struct blk_mq_hw_ctx *this_hctx;
 	struct blk_mq_ctx *this_ctx;
 	struct request_queue *this_q;
 	struct request *rq;
 	LIST_HEAD(list);
-	LIST_HEAD(ctx_list);
+	LIST_HEAD(rq_list);
 	unsigned int depth;
 
 	list_splice_init(&plug->mq_list, &list);
@@ -1636,6 +1637,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	list_sort(NULL, &list, plug_ctx_cmp);
 
 	this_q = NULL;
+	this_hctx = NULL;
 	this_ctx = NULL;
 	depth = 0;
 
@@ -1643,30 +1645,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		rq = list_entry_rq(list.next);
 		list_del_init(&rq->queuelist);
 		BUG_ON(!rq->q);
-		if (rq->mq_ctx != this_ctx) {
-			if (this_ctx) {
+		if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
+			if (this_hctx) {
 				trace_block_unplug(this_q, depth, !from_schedule);
-				blk_mq_sched_insert_requests(this_q, this_ctx,
-								&ctx_list,
+				blk_mq_sched_insert_requests(this_hctx, this_ctx,
+								&rq_list,
 								from_schedule);
 			}
 
-			this_ctx = rq->mq_ctx;
 			this_q = rq->q;
+			this_ctx = rq->mq_ctx;
+			this_hctx = rq->mq_hctx;
 			depth = 0;
 		}
 
 		depth++;
-		list_add_tail(&rq->queuelist, &ctx_list);
+		list_add_tail(&rq->queuelist, &rq_list);
 	}
 
 	/*
-	 * If 'this_ctx' is set, we know we have entries to complete
-	 * on 'ctx_list'. Do those.
+	 * If 'this_hctx' is set, we know we have entries to complete
+	 * on 'rq_list'. Do those.
 	 */
-	if (this_ctx) {
+	if (this_hctx) {
 		trace_block_unplug(this_q, depth, !from_schedule);
-		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+		blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
 						from_schedule);
 	}
 }

From 3110fc79606fb6003949246c6fb325dd43445273 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Oct 2018 12:24:04 -0600
Subject: [PATCH 049/351] blk-mq: improve plug list sorting

Currently we only look at the software queue, but with support
for multiple maps, we should also look at the hardware queue.
This is important since we'll flush out the request list if
either the software queue or hardware queue don't match.

This sorts by software queue first, then hardware queue if
that differs. Finally we sort by request location like before.
This minimizes the flush points per plug list.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 271726d48003..45c92b8d4795 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1612,14 +1612,21 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	spin_unlock(&ctx->lock);
 }
 
-static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
 {
 	struct request *rqa = container_of(a, struct request, queuelist);
 	struct request *rqb = container_of(b, struct request, queuelist);
 
-	return !(rqa->mq_ctx < rqb->mq_ctx ||
-		 (rqa->mq_ctx == rqb->mq_ctx &&
-		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
+	if (rqa->mq_ctx < rqb->mq_ctx)
+		return -1;
+	else if (rqa->mq_ctx > rqb->mq_ctx)
+		return 1;
+	else if (rqa->mq_hctx < rqb->mq_hctx)
+		return -1;
+	else if (rqa->mq_hctx > rqb->mq_hctx)
+		return 1;
+
+	return blk_rq_pos(rqa) > blk_rq_pos(rqb);
 }
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
@@ -1634,7 +1641,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 	list_splice_init(&plug->mq_list, &list);
 
-	list_sort(NULL, &list, plug_ctx_cmp);
+	list_sort(NULL, &list, plug_rq_cmp);
 
 	this_q = NULL;
 	this_hctx = NULL;

From 843477d4cc5c4bb4e346c561ecd3b9d0bd67e8c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 24 Oct 2018 13:16:11 -0600
Subject: [PATCH 050/351] blk-mq: initial support for multiple queue maps

Add a queue offset to the tag map. This enables users to map
iteratively, for each queue map type they support.

Bump maximum number of supported maps to 2, we're now fully
able to support more than 1 map.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-cpumap.c  | 9 +++++----
 block/blk-mq-pci.c     | 2 +-
 block/blk-mq-virtio.c  | 2 +-
 include/linux/blk-mq.h | 3 ++-
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 6e6686c55984..03a534820271 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -14,9 +14,10 @@
 #include "blk.h"
 #include "blk-mq.h"
 
-static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
+static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
+			      unsigned int nr_queues, const int cpu)
 {
-	return cpu % nr_queues;
+	return qmap->queue_offset + (cpu % nr_queues);
 }
 
 static int get_first_sibling(unsigned int cpu)
@@ -44,11 +45,11 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
 		 * performace optimizations.
 		 */
 		if (cpu < nr_queues) {
-			map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+			map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
 		} else {
 			first_sibling = get_first_sibling(cpu);
 			if (first_sibling == cpu)
-				map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+				map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
 			else
 				map[cpu] = map[first_sibling];
 		}
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index 40333d60a850..1dce18553984 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -43,7 +43,7 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			qmap->mq_map[cpu] = queue;
+			qmap->mq_map[cpu] = qmap->queue_offset + queue;
 	}
 
 	return 0;
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index 661fbfef480f..370827163835 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -44,7 +44,7 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			qmap->mq_map[cpu] = queue;
+			qmap->mq_map[cpu] = qmap->queue_offset + queue;
 	}
 
 	return 0;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8994c95056a8..729ce0f00433 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -78,10 +78,11 @@ struct blk_mq_hw_ctx {
 struct blk_mq_queue_map {
 	unsigned int *mq_map;
 	unsigned int nr_queues;
+	unsigned int queue_offset;
 };
 
 enum {
-	HCTX_MAX_TYPES = 1,
+	HCTX_MAX_TYPES = 2,
 };
 
 struct blk_mq_tag_set {

From 3b6592f70ad7b4c24dd3eb2ac9bbe3353d02c992 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 31 Oct 2018 08:36:31 -0600
Subject: [PATCH 051/351] nvme: utilize two queue maps, one for reads and one
 for writes

NVMe does round-robin between queues by default, which means that
sharing a queue map for both reads and writes can be problematic
in terms of read servicing. It's much easier to flood the queue
with writes and reduce the read servicing.

Implement two queue maps, one for reads and one for writes. The
write queue count is configurable through the 'write_queues'
parameter.

By default, we retain the previous behavior of having a single
queue set, shared between reads and writes. Setting 'write_queues'
to a non-zero value will create two queue sets, one for reads and
one for writes, the latter using the configurable number of
queues (hardware queue counts permitting).

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 200 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 181 insertions(+), 19 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 49ad854d1b91..1987df13b73e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -74,11 +74,29 @@ static int io_queue_depth = 1024;
 module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
 MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");
 
+static int queue_count_set(const char *val, const struct kernel_param *kp);
+static const struct kernel_param_ops queue_count_ops = {
+	.set = queue_count_set,
+	.get = param_get_int,
+};
+
+static int write_queues;
+module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
+MODULE_PARM_DESC(write_queues,
+	"Number of queues to use for writes. If not set, reads and writes "
+	"will share a queue set.");
+
 struct nvme_dev;
 struct nvme_queue;
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
+enum {
+	NVMEQ_TYPE_READ,
+	NVMEQ_TYPE_WRITE,
+	NVMEQ_TYPE_NR,
+};
+
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
@@ -92,6 +110,7 @@ struct nvme_dev {
 	struct dma_pool *prp_small_pool;
 	unsigned online_queues;
 	unsigned max_qid;
+	unsigned io_queues[NVMEQ_TYPE_NR];
 	unsigned int num_vecs;
 	int q_depth;
 	u32 db_stride;
@@ -134,6 +153,17 @@ static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
 	return param_set_int(val, kp);
 }
 
+static int queue_count_set(const char *val, const struct kernel_param *kp)
+{
+	int n = 0, ret;
+
+	ret = kstrtoint(val, 10, &n);
+	if (n > num_possible_cpus())
+		n = num_possible_cpus();
+
+	return param_set_int(val, kp);
+}
+
 static inline unsigned int sq_idx(unsigned int qid, u32 stride)
 {
 	return qid * 2 * stride;
@@ -218,9 +248,20 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
 }
 
+static unsigned int max_io_queues(void)
+{
+	return num_possible_cpus() + write_queues;
+}
+
+static unsigned int max_queue_count(void)
+{
+	/* IO queues + admin queue */
+	return 1 + max_io_queues();
+}
+
 static inline unsigned int nvme_dbbuf_size(u32 stride)
 {
-	return ((num_possible_cpus() + 1) * 8 * stride);
+	return (max_queue_count() * 8 * stride);
 }
 
 static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
@@ -431,12 +472,41 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
 	return 0;
 }
 
+static int queue_irq_offset(struct nvme_dev *dev)
+{
+	/* if we have more than 1 vec, admin queue offsets us by 1 */
+	if (dev->num_vecs > 1)
+		return 1;
+
+	return 0;
+}
+
 static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_dev *dev = set->driver_data;
+	int i, qoff, offset;
 
-	return blk_mq_pci_map_queues(&set->map[0], to_pci_dev(dev->dev),
-			dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
+	offset = queue_irq_offset(dev);
+	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		map->nr_queues = dev->io_queues[i];
+		if (!map->nr_queues) {
+			BUG_ON(i == NVMEQ_TYPE_READ);
+
+			/* shared set, resuse read set parameters */
+			map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ];
+			qoff = 0;
+			offset = queue_irq_offset(dev);
+		}
+
+		map->queue_offset = qoff;
+		blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+		qoff += map->nr_queues;
+		offset += map->nr_queues;
+	}
+
+	return 0;
 }
 
 /**
@@ -849,6 +919,14 @@ out_free_cmd:
 	return ret;
 }
 
+static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags)
+{
+	if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+		return NVMEQ_TYPE_READ;
+
+	return NVMEQ_TYPE_WRITE;
+}
+
 static void nvme_pci_complete_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1475,13 +1553,14 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 };
 
 static const struct blk_mq_ops nvme_mq_ops = {
-	.queue_rq	= nvme_queue_rq,
-	.complete	= nvme_pci_complete_rq,
-	.init_hctx	= nvme_init_hctx,
-	.init_request	= nvme_init_request,
-	.map_queues	= nvme_pci_map_queues,
-	.timeout	= nvme_timeout,
-	.poll		= nvme_poll,
+	.queue_rq		= nvme_queue_rq,
+	.rq_flags_to_type	= nvme_rq_flags_to_type,
+	.complete		= nvme_pci_complete_rq,
+	.init_hctx		= nvme_init_hctx,
+	.init_request		= nvme_init_request,
+	.map_queues		= nvme_pci_map_queues,
+	.timeout		= nvme_timeout,
+	.poll			= nvme_poll,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -1891,6 +1970,87 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 	return ret;
 }
 
+static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
+{
+	unsigned int this_w_queues = write_queues;
+
+	/*
+	 * Setup read/write queue split
+	 */
+	if (nr_io_queues == 1) {
+		dev->io_queues[NVMEQ_TYPE_READ] = 1;
+		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
+		return;
+	}
+
+	/*
+	 * If 'write_queues' is set, ensure it leaves room for at least
+	 * one read queue
+	 */
+	if (this_w_queues >= nr_io_queues)
+		this_w_queues = nr_io_queues - 1;
+
+	/*
+	 * If 'write_queues' is set to zero, reads and writes will share
+	 * a queue set.
+	 */
+	if (!this_w_queues) {
+		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
+		dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues;
+	} else {
+		dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues;
+		dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues;
+	}
+}
+
+static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
+{
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
+	int irq_sets[2];
+	struct irq_affinity affd = {
+		.pre_vectors = 1,
+		.nr_sets = ARRAY_SIZE(irq_sets),
+		.sets = irq_sets,
+	};
+	int result;
+
+	/*
+	 * For irq sets, we have to ask for minvec == maxvec. This passes
+	 * any reduction back to us, so we can adjust our queue counts and
+	 * IRQ vector needs.
+	 */
+	do {
+		nvme_calc_io_queues(dev, nr_io_queues);
+		irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ];
+		irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE];
+		if (!irq_sets[1])
+			affd.nr_sets = 1;
+
+		/*
+		 * Need IRQs for read+write queues, and one for the admin queue
+		 */
+		nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
+
+		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
+				nr_io_queues,
+				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+		/*
+		 * Need to reduce our vec counts
+		 */
+		if (result == -ENOSPC) {
+			nr_io_queues--;
+			if (!nr_io_queues)
+				return result;
+			continue;
+		} else if (result <= 0)
+			return -EIO;
+		break;
+	} while (1);
+
+	return result;
+}
+
 static int nvme_setup_io_queues(struct nvme_dev *dev)
 {
 	struct nvme_queue *adminq = &dev->queues[0];
@@ -1898,11 +2058,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	int result, nr_io_queues;
 	unsigned long size;
 
-	struct irq_affinity affd = {
-		.pre_vectors = 1
-	};
-
-	nr_io_queues = num_possible_cpus();
+	nr_io_queues = max_io_queues();
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
 	if (result < 0)
 		return result;
@@ -1937,13 +2093,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * setting up the full range we need.
 	 */
 	pci_free_irq_vectors(pdev);
-	result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
-			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+	result = nvme_setup_irqs(dev, nr_io_queues);
 	if (result <= 0)
 		return -EIO;
+
 	dev->num_vecs = result;
 	dev->max_qid = max(result - 1, 1);
 
+	dev_info(dev->ctrl.device, "%d/%d read/write queues\n",
+					dev->io_queues[NVMEQ_TYPE_READ],
+					dev->io_queues[NVMEQ_TYPE_WRITE]);
+
 	/*
 	 * Should investigate if there's a performance win from allocating
 	 * more queues than interrupt vectors; it might allow the submission
@@ -2045,6 +2206,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	if (!dev->ctrl.tagset) {
 		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
+		dev->tagset.nr_maps = NVMEQ_TYPE_NR;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;
 		dev->tagset.numa_node = dev_to_node(dev->dev);
 		dev->tagset.queue_depth =
@@ -2491,8 +2653,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (!dev)
 		return -ENOMEM;
 
-	dev->queues = kcalloc_node(num_possible_cpus() + 1,
-			sizeof(struct nvme_queue), GFP_KERNEL, node);
+	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
+					GFP_KERNEL, node);
 	if (!dev->queues)
 		goto free;
 

From d1e36282b0bbd5de6a9c4d5275e94ef3b3438f48 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 29 Aug 2018 10:36:56 -0600
Subject: [PATCH 052/351] block: add REQ_HIPRI and inherit it from IOCB_HIPRI

We use IOCB_HIPRI to poll for IO in the caller instead of scheduling.
This information is not available for (or after) IO submission. The
driver may make different queue choices based on the type of IO, so
make the fact that we will poll for this IO known to the lower layers
as well.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c            | 2 ++
 fs/direct-io.c            | 2 ++
 fs/iomap.c                | 9 ++++++++-
 include/linux/blk_types.h | 4 +++-
 4 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a80b4f0ee7c4..c039abfb2052 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -232,6 +232,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 		bio.bi_opf = dio_bio_write_op(iocb);
 		task_io_account_write(ret);
 	}
+	if (iocb->ki_flags & IOCB_HIPRI)
+		bio.bi_opf |= REQ_HIPRI;
 
 	qc = submit_bio(&bio);
 	for (;;) {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 722d17c88edb..ea07d5a34317 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1265,6 +1265,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	} else {
 		dio->op = REQ_OP_READ;
 	}
+	if (iocb->ki_flags & IOCB_HIPRI)
+		dio->op_flags |= REQ_HIPRI;
 
 	/*
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
diff --git a/fs/iomap.c b/fs/iomap.c
index 64ce240217a1..f61d13dfdf09 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1553,6 +1553,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 		unsigned len)
 {
 	struct page *page = ZERO_PAGE(0);
+	int flags = REQ_SYNC | REQ_IDLE;
 	struct bio *bio;
 
 	bio = bio_alloc(GFP_KERNEL, 1);
@@ -1561,9 +1562,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
 	bio->bi_private = dio;
 	bio->bi_end_io = iomap_dio_bio_end_io;
 
+	if (dio->iocb->ki_flags & IOCB_HIPRI)
+		flags |= REQ_HIPRI;
+
 	get_page(page);
 	__bio_add_page(bio, page, len, 0);
-	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
+	bio_set_op_attrs(bio, REQ_OP_WRITE, flags);
 
 	atomic_inc(&dio->ref);
 	return submit_bio(bio);
@@ -1662,6 +1666,9 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
 				bio_set_pages_dirty(bio);
 		}
 
+		if (dio->iocb->ki_flags & IOCB_HIPRI)
+			bio->bi_opf |= REQ_HIPRI;
+
 		iov_iter_advance(dio->submit.iter, n);
 
 		dio->size += n;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1dcf652ba0aa..dbdbfbd6a987 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -323,6 +323,8 @@ enum req_flag_bits {
 	/* command specific flags for REQ_OP_WRITE_ZEROES: */
 	__REQ_NOUNMAP,		/* do not free blocks when zeroing */
 
+	__REQ_HIPRI,
+
 	/* for driver use */
 	__REQ_DRV,
 	__REQ_SWAP,		/* swapping request. */
@@ -343,8 +345,8 @@ enum req_flag_bits {
 #define REQ_RAHEAD		(1ULL << __REQ_RAHEAD)
 #define REQ_BACKGROUND		(1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT		(1ULL << __REQ_NOWAIT)
-
 #define REQ_NOUNMAP		(1ULL << __REQ_NOUNMAP)
+#define REQ_HIPRI		(1ULL << __REQ_HIPRI)
 
 #define REQ_DRV			(1ULL << __REQ_DRV)
 #define REQ_SWAP		(1ULL << __REQ_SWAP)

From 4b04cc6a8f86c4842314def22332de1f15de8523 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 5 Nov 2018 12:44:33 -0700
Subject: [PATCH 053/351] nvme: add separate poll queue map

Adds support for defining a variable number of poll queues, currently
configurable with the 'poll_queues' module parameter. Defaults to
a single poll queue.

And now we finally have poll support without triggering interrupts!

Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 97 +++++++++++++++++++++++++++++++++--------
 include/linux/blk-mq.h  |  2 +-
 2 files changed, 81 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 1987df13b73e..6aa86dfcb32c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,6 +86,10 @@ MODULE_PARM_DESC(write_queues,
 	"Number of queues to use for writes. If not set, reads and writes "
 	"will share a queue set.");
 
+static int poll_queues = 1;
+module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
+MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
+
 struct nvme_dev;
 struct nvme_queue;
 
@@ -94,6 +98,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 enum {
 	NVMEQ_TYPE_READ,
 	NVMEQ_TYPE_WRITE,
+	NVMEQ_TYPE_POLL,
 	NVMEQ_TYPE_NR,
 };
 
@@ -202,6 +207,7 @@ struct nvme_queue {
 	u16 last_cq_head;
 	u16 qid;
 	u8 cq_phase;
+	u8 polled;
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -250,7 +256,7 @@ static inline void _nvme_check_size(void)
 
 static unsigned int max_io_queues(void)
 {
-	return num_possible_cpus() + write_queues;
+	return num_possible_cpus() + write_queues + poll_queues;
 }
 
 static unsigned int max_queue_count(void)
@@ -500,8 +506,15 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 			offset = queue_irq_offset(dev);
 		}
 
+		/*
+		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
+		 * affinity), so use the regular blk-mq cpu mapping
+		 */
 		map->queue_offset = qoff;
-		blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+		if (i != NVMEQ_TYPE_POLL)
+			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
+		else
+			blk_mq_map_queues(map);
 		qoff += map->nr_queues;
 		offset += map->nr_queues;
 	}
@@ -892,7 +905,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * We should not need to do this, but we're still using this to
 	 * ensure we can drain requests on a dying queue.
 	 */
-	if (unlikely(nvmeq->cq_vector < 0))
+	if (unlikely(nvmeq->cq_vector < 0 && !nvmeq->polled))
 		return BLK_STS_IOERR;
 
 	ret = nvme_setup_cmd(ns, req, &cmnd);
@@ -921,6 +934,8 @@ out_free_cmd:
 
 static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags)
 {
+	if ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		return NVMEQ_TYPE_POLL;
 	if ((flags & REQ_OP_MASK) == REQ_OP_READ)
 		return NVMEQ_TYPE_READ;
 
@@ -1094,7 +1109,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 		struct nvme_queue *nvmeq, s16 vector)
 {
 	struct nvme_command c;
-	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+	int flags = NVME_QUEUE_PHYS_CONTIG;
+
+	if (vector != -1)
+		flags |= NVME_CQ_IRQ_ENABLED;
 
 	/*
 	 * Note: we (ab)use the fact that the prp fields survive if no data
@@ -1106,7 +1124,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cqid = cpu_to_le16(qid);
 	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
 	c.create_cq.cq_flags = cpu_to_le16(flags);
-	c.create_cq.irq_vector = cpu_to_le16(vector);
+	if (vector != -1)
+		c.create_cq.irq_vector = cpu_to_le16(vector);
+	else
+		c.create_cq.irq_vector = 0;
 
 	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
 }
@@ -1348,13 +1369,14 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	int vector;
 
 	spin_lock_irq(&nvmeq->cq_lock);
-	if (nvmeq->cq_vector == -1) {
+	if (nvmeq->cq_vector == -1 && !nvmeq->polled) {
 		spin_unlock_irq(&nvmeq->cq_lock);
 		return 1;
 	}
 	vector = nvmeq->cq_vector;
 	nvmeq->dev->online_queues--;
 	nvmeq->cq_vector = -1;
+	nvmeq->polled = false;
 	spin_unlock_irq(&nvmeq->cq_lock);
 
 	/*
@@ -1366,7 +1388,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
 		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
 
-	pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
+	if (vector != -1)
+		pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
 
 	return 0;
 }
@@ -1500,7 +1523,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	spin_unlock_irq(&nvmeq->cq_lock);
 }
 
-static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 {
 	struct nvme_dev *dev = nvmeq->dev;
 	int result;
@@ -1510,7 +1533,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	 * A queue's vector matches the queue identifier unless the controller
 	 * has only one vector available.
 	 */
-	vector = dev->num_vecs == 1 ? 0 : qid;
+	if (!polled)
+		vector = dev->num_vecs == 1 ? 0 : qid;
+	else
+		vector = -1;
+
 	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
 	if (result)
 		return result;
@@ -1527,15 +1554,20 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 	 * xxx' warning if the create CQ/SQ command times out.
 	 */
 	nvmeq->cq_vector = vector;
+	nvmeq->polled = polled;
 	nvme_init_queue(nvmeq, qid);
-	result = queue_request_irq(nvmeq);
-	if (result < 0)
-		goto release_sq;
+
+	if (vector != -1) {
+		result = queue_request_irq(nvmeq);
+		if (result < 0)
+			goto release_sq;
+	}
 
 	return result;
 
 release_sq:
 	nvmeq->cq_vector = -1;
+	nvmeq->polled = false;
 	dev->online_queues--;
 	adapter_delete_sq(dev, qid);
 release_cq:
@@ -1686,7 +1718,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 
 static int nvme_create_io_queues(struct nvme_dev *dev)
 {
-	unsigned i, max;
+	unsigned i, max, rw_queues;
 	int ret = 0;
 
 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
@@ -1697,8 +1729,17 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	}
 
 	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
+	if (max != 1 && dev->io_queues[NVMEQ_TYPE_POLL]) {
+		rw_queues = dev->io_queues[NVMEQ_TYPE_READ] +
+				dev->io_queues[NVMEQ_TYPE_WRITE];
+	} else {
+		rw_queues = max;
+	}
+
 	for (i = dev->online_queues; i <= max; i++) {
-		ret = nvme_create_queue(&dev->queues[i], i);
+		bool polled = i > rw_queues;
+
+		ret = nvme_create_queue(&dev->queues[i], i, polled);
 		if (ret)
 			break;
 	}
@@ -1973,6 +2014,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
 	unsigned int this_w_queues = write_queues;
+	unsigned int this_p_queues = poll_queues;
 
 	/*
 	 * Setup read/write queue split
@@ -1980,9 +2022,28 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
 	if (nr_io_queues == 1) {
 		dev->io_queues[NVMEQ_TYPE_READ] = 1;
 		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
+		dev->io_queues[NVMEQ_TYPE_POLL] = 0;
 		return;
 	}
 
+	/*
+	 * Configure number of poll queues, if set
+	 */
+	if (this_p_queues) {
+		/*
+		 * We need at least one queue left. With just one queue, we'll
+		 * have a single shared read/write set.
+		 */
+		if (this_p_queues >= nr_io_queues) {
+			this_w_queues = 0;
+			this_p_queues = nr_io_queues - 1;
+		}
+
+		dev->io_queues[NVMEQ_TYPE_POLL] = this_p_queues;
+		nr_io_queues -= this_p_queues;
+	} else
+		dev->io_queues[NVMEQ_TYPE_POLL] = 0;
+
 	/*
 	 * If 'write_queues' is set, ensure it leaves room for at least
 	 * one read queue
@@ -2099,11 +2160,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		return -EIO;
 
 	dev->num_vecs = result;
-	dev->max_qid = max(result - 1, 1);
+	result = max(result - 1, 1);
+	dev->max_qid = result + dev->io_queues[NVMEQ_TYPE_POLL];
 
-	dev_info(dev->ctrl.device, "%d/%d read/write queues\n",
+	dev_info(dev->ctrl.device, "%d/%d/%d read/write/poll queues\n",
 					dev->io_queues[NVMEQ_TYPE_READ],
-					dev->io_queues[NVMEQ_TYPE_WRITE]);
+					dev->io_queues[NVMEQ_TYPE_WRITE],
+					dev->io_queues[NVMEQ_TYPE_POLL]);
 
 	/*
 	 * Should investigate if there's a performance win from allocating
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 729ce0f00433..9f5e93f40857 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -82,7 +82,7 @@ struct blk_mq_queue_map {
 };
 
 enum {
-	HCTX_MAX_TYPES = 2,
+	HCTX_MAX_TYPES = 3,
 };
 
 struct blk_mq_tag_set {

From dbef5257737b989d4785b62939f1e9cba3948689 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 7 Nov 2018 21:17:57 -0700
Subject: [PATCH 054/351] sunvdc: fix compiler warning

Stephen reports:

After merging the block tree, today's linux-next build (sparc64 defconfig)
produced this warning:

/home/sfr/next/next/drivers/block/sunvdc.c: In function 'init_queue':
/home/sfr/next/next/drivers/block/sunvdc.c:788:6: warning: unused variable 'ret' [-Wunused-variable]
  int ret;
      ^~~

Kill the unused variable.

Fixes: fa182a1fa97d ("sunvdc: convert to blk-mq")
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/sunvdc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 95cb4ea8e402..175a64619602 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -785,7 +785,6 @@ static void cleanup_queue(struct request_queue *q)
 static struct request_queue *init_queue(struct vdc_port *port)
 {
 	struct request_queue *q;
-	int ret;
 
 	q = blk_mq_init_sq_queue(&port->tag_set, &vdc_mq_ops, VDC_TX_RING_SIZE,
 					BLK_MQ_F_SHOULD_MERGE);

From e051bd0ddfdd6696197ad8c12d7485fc68c39f7b Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Thu, 8 Nov 2018 11:08:09 +0000
Subject: [PATCH 055/351] ms_block: remove unused pointer 'set'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Pointer 'set' is declared but not used, remove it. Cleans up warning:

warning: unused variable ‘set’ [-Wunused-variable]

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/memstick/core/ms_block.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index ee0be66a9a03..82daccc9ea62 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2104,7 +2104,6 @@ static const struct blk_mq_ops msb_mq_ops = {
 static int msb_init_disk(struct memstick_dev *card)
 {
 	struct msb_data *msb = memstick_get_drvdata(card);
-	struct blk_mq_tag_set *set = NULL;
 	int rc;
 	unsigned long capacity;
 

From b1ab5fa309e6c49e4e06270ec67dd7b3e9971d04 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 8 Nov 2018 14:01:01 +0100
Subject: [PATCH 056/351] block/loop: Don't grab "struct file" for
 vfs_getattr() operation.

vfs_getattr() needs "struct path" rather than "struct file".
Let's use path_get()/path_put() rather than get_file()/fput().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cb0cc8685076..a29ef169f360 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1204,7 +1204,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 static int
 loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 {
-	struct file *file;
+	struct path path;
 	struct kstat stat;
 	int ret;
 
@@ -1229,16 +1229,16 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 	}
 
 	/* Drop lo_ctl_mutex while we call into the filesystem. */
-	file = get_file(lo->lo_backing_file);
+	path = lo->lo_backing_file->f_path;
+	path_get(&path);
 	mutex_unlock(&lo->lo_ctl_mutex);
-	ret = vfs_getattr(&file->f_path, &stat, STATX_INO,
-			  AT_STATX_SYNC_AS_STAT);
+	ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
 	if (!ret) {
 		info->lo_device = huge_encode_dev(stat.dev);
 		info->lo_inode = stat.ino;
 		info->lo_rdevice = huge_encode_dev(stat.rdev);
 	}
-	fput(file);
+	path_put(&path);
 	return ret;
 }
 

From 310ca162d779efee8a2dc3731439680f3e9c1e86 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 8 Nov 2018 14:01:02 +0100
Subject: [PATCH 057/351] block/loop: Use global lock for ioctl() operation.

syzbot is reporting NULL pointer dereference [1] which is caused by
race condition between ioctl(loop_fd, LOOP_CLR_FD, 0) versus
ioctl(other_loop_fd, LOOP_SET_FD, loop_fd) due to traversing other
loop devices at loop_validate_file() without holding corresponding
lo->lo_ctl_mutex locks.

Since ioctl() request on loop devices is not frequent operation, we don't
need fine grained locking. Let's use global lock in order to allow safe
traversal at loop_validate_file().

Note that syzbot is also reporting circular locking dependency between
bdev->bd_mutex and lo->lo_ctl_mutex [2] which is caused by calling
blkdev_reread_part() with lock held. This patch does not address it.

[1] https://syzkaller.appspot.com/bug?id=f3cfe26e785d85f9ee259f385515291d21bd80a3
[2] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d15889

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: syzbot <syzbot+bf89c128e05dd6c62523@syzkaller.appspotmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 58 ++++++++++++++++++++++----------------------
 drivers/block/loop.h |  1 -
 2 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index a29ef169f360..63008e879771 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -84,6 +84,7 @@
 
 static DEFINE_IDR(loop_index_idr);
 static DEFINE_MUTEX(loop_index_mutex);
+static DEFINE_MUTEX(loop_ctl_mutex);
 
 static int max_part;
 static int part_shift;
@@ -1046,7 +1047,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	 */
 	if (atomic_read(&lo->lo_refcnt) > 1) {
 		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-		mutex_unlock(&lo->lo_ctl_mutex);
+		mutex_unlock(&loop_ctl_mutex);
 		return 0;
 	}
 
@@ -1099,12 +1100,12 @@ static int loop_clr_fd(struct loop_device *lo)
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 	loop_unprepare_queue(lo);
-	mutex_unlock(&lo->lo_ctl_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 	/*
-	 * Need not hold lo_ctl_mutex to fput backing file.
-	 * Calling fput holding lo_ctl_mutex triggers a circular
+	 * Need not hold loop_ctl_mutex to fput backing file.
+	 * Calling fput holding loop_ctl_mutex triggers a circular
 	 * lock dependency possibility warning as fput can take
-	 * bd_mutex which is usually taken before lo_ctl_mutex.
+	 * bd_mutex which is usually taken before loop_ctl_mutex.
 	 */
 	fput(filp);
 	return 0;
@@ -1209,7 +1210,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 	int ret;
 
 	if (lo->lo_state != Lo_bound) {
-		mutex_unlock(&lo->lo_ctl_mutex);
+		mutex_unlock(&loop_ctl_mutex);
 		return -ENXIO;
 	}
 
@@ -1228,10 +1229,10 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 		       lo->lo_encrypt_key_size);
 	}
 
-	/* Drop lo_ctl_mutex while we call into the filesystem. */
+	/* Drop loop_ctl_mutex while we call into the filesystem. */
 	path = lo->lo_backing_file->f_path;
 	path_get(&path);
-	mutex_unlock(&lo->lo_ctl_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 	ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
 	if (!ret) {
 		info->lo_device = huge_encode_dev(stat.dev);
@@ -1323,7 +1324,7 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
 	int err;
 
 	if (!arg) {
-		mutex_unlock(&lo->lo_ctl_mutex);
+		mutex_unlock(&loop_ctl_mutex);
 		return -EINVAL;
 	}
 	err = loop_get_status(lo, &info64);
@@ -1341,7 +1342,7 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
 	int err;
 
 	if (!arg) {
-		mutex_unlock(&lo->lo_ctl_mutex);
+		mutex_unlock(&loop_ctl_mutex);
 		return -EINVAL;
 	}
 	err = loop_get_status(lo, &info64);
@@ -1399,7 +1400,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	int err;
 
-	err = mutex_lock_killable_nested(&lo->lo_ctl_mutex, 1);
+	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
 	if (err)
 		goto out_unlocked;
 
@@ -1411,7 +1412,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		err = loop_change_fd(lo, bdev, arg);
 		break;
 	case LOOP_CLR_FD:
-		/* loop_clr_fd would have unlocked lo_ctl_mutex on success */
+		/* loop_clr_fd would have unlocked loop_ctl_mutex on success */
 		err = loop_clr_fd(lo);
 		if (!err)
 			goto out_unlocked;
@@ -1424,7 +1425,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 	case LOOP_GET_STATUS:
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
-		/* loop_get_status() unlocks lo_ctl_mutex */
+		/* loop_get_status() unlocks loop_ctl_mutex */
 		goto out_unlocked;
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
@@ -1434,7 +1435,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		break;
 	case LOOP_GET_STATUS64:
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
-		/* loop_get_status() unlocks lo_ctl_mutex */
+		/* loop_get_status() unlocks loop_ctl_mutex */
 		goto out_unlocked;
 	case LOOP_SET_CAPACITY:
 		err = -EPERM;
@@ -1454,7 +1455,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	default:
 		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
 	}
-	mutex_unlock(&lo->lo_ctl_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 
 out_unlocked:
 	return err;
@@ -1571,7 +1572,7 @@ loop_get_status_compat(struct loop_device *lo,
 	int err;
 
 	if (!arg) {
-		mutex_unlock(&lo->lo_ctl_mutex);
+		mutex_unlock(&loop_ctl_mutex);
 		return -EINVAL;
 	}
 	err = loop_get_status(lo, &info64);
@@ -1588,19 +1589,19 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 
 	switch(cmd) {
 	case LOOP_SET_STATUS:
-		err = mutex_lock_killable(&lo->lo_ctl_mutex);
+		err = mutex_lock_killable(&loop_ctl_mutex);
 		if (!err) {
 			err = loop_set_status_compat(lo,
 						     (const struct compat_loop_info __user *)arg);
-			mutex_unlock(&lo->lo_ctl_mutex);
+			mutex_unlock(&loop_ctl_mutex);
 		}
 		break;
 	case LOOP_GET_STATUS:
-		err = mutex_lock_killable(&lo->lo_ctl_mutex);
+		err = mutex_lock_killable(&loop_ctl_mutex);
 		if (!err) {
 			err = loop_get_status_compat(lo,
 						     (struct compat_loop_info __user *)arg);
-			/* loop_get_status() unlocks lo_ctl_mutex */
+			/* loop_get_status() unlocks loop_ctl_mutex */
 		}
 		break;
 	case LOOP_SET_CAPACITY:
@@ -1647,7 +1648,7 @@ static void __lo_release(struct loop_device *lo)
 	if (atomic_dec_return(&lo->lo_refcnt))
 		return;
 
-	mutex_lock(&lo->lo_ctl_mutex);
+	mutex_lock(&loop_ctl_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
 		/*
 		 * In autoclear mode, stop the loop thread
@@ -1665,7 +1666,7 @@ static void __lo_release(struct loop_device *lo)
 		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
-	mutex_unlock(&lo->lo_ctl_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 }
 
 static void lo_release(struct gendisk *disk, fmode_t mode)
@@ -1711,10 +1712,10 @@ static int unregister_transfer_cb(int id, void *ptr, void *data)
 	struct loop_device *lo = ptr;
 	struct loop_func_table *xfer = data;
 
-	mutex_lock(&lo->lo_ctl_mutex);
+	mutex_lock(&loop_ctl_mutex);
 	if (lo->lo_encryption == xfer)
 		loop_release_xfer(lo);
-	mutex_unlock(&lo->lo_ctl_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 	return 0;
 }
 
@@ -1895,7 +1896,6 @@ static int loop_add(struct loop_device **l, int i)
 	if (!part_shift)
 		disk->flags |= GENHD_FL_NO_PART_SCAN;
 	disk->flags |= GENHD_FL_EXT_DEVT;
-	mutex_init(&lo->lo_ctl_mutex);
 	atomic_set(&lo->lo_refcnt, 0);
 	lo->lo_number		= i;
 	spin_lock_init(&lo->lo_lock);
@@ -2008,21 +2008,21 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 		ret = loop_lookup(&lo, parm);
 		if (ret < 0)
 			break;
-		ret = mutex_lock_killable(&lo->lo_ctl_mutex);
+		ret = mutex_lock_killable(&loop_ctl_mutex);
 		if (ret)
 			break;
 		if (lo->lo_state != Lo_unbound) {
 			ret = -EBUSY;
-			mutex_unlock(&lo->lo_ctl_mutex);
+			mutex_unlock(&loop_ctl_mutex);
 			break;
 		}
 		if (atomic_read(&lo->lo_refcnt) > 0) {
 			ret = -EBUSY;
-			mutex_unlock(&lo->lo_ctl_mutex);
+			mutex_unlock(&loop_ctl_mutex);
 			break;
 		}
 		lo->lo_disk->private_data = NULL;
-		mutex_unlock(&lo->lo_ctl_mutex);
+		mutex_unlock(&loop_ctl_mutex);
 		idr_remove(&loop_index_idr, lo->lo_number);
 		loop_remove(lo);
 		break;
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 4d42c7af7de7..af75a5ee4094 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -54,7 +54,6 @@ struct loop_device {
 
 	spinlock_t		lo_lock;
 	int			lo_state;
-	struct mutex		lo_ctl_mutex;
 	struct kthread_worker	worker;
 	struct task_struct	*worker_task;
 	bool			use_dio;

From 967d1dc144b50ad005e5eecdfadfbcfb399ffff6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:03 +0100
Subject: [PATCH 058/351] loop: Fold __loop_release into loop_release

__loop_release() has a single call site. Fold it there. This is
currently not a huge win but it will make following replacement of
loop_index_mutex more obvious.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 63008e879771..3de2cd94225a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1641,12 +1641,15 @@ out:
 	return err;
 }
 
-static void __lo_release(struct loop_device *lo)
+static void lo_release(struct gendisk *disk, fmode_t mode)
 {
+	struct loop_device *lo;
 	int err;
 
+	mutex_lock(&loop_index_mutex);
+	lo = disk->private_data;
 	if (atomic_dec_return(&lo->lo_refcnt))
-		return;
+		goto unlock_index;
 
 	mutex_lock(&loop_ctl_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
@@ -1656,7 +1659,7 @@ static void __lo_release(struct loop_device *lo)
 		 */
 		err = loop_clr_fd(lo);
 		if (!err)
-			return;
+			goto unlock_index;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
 		 * Otherwise keep thread (if running) and config,
@@ -1667,12 +1670,7 @@ static void __lo_release(struct loop_device *lo)
 	}
 
 	mutex_unlock(&loop_ctl_mutex);
-}
-
-static void lo_release(struct gendisk *disk, fmode_t mode)
-{
-	mutex_lock(&loop_index_mutex);
-	__lo_release(disk->private_data);
+unlock_index:
 	mutex_unlock(&loop_index_mutex);
 }
 

From 0a42e99b58a208839626465af194cfe640ef9493 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:04 +0100
Subject: [PATCH 059/351] loop: Get rid of loop_index_mutex

Now that loop_ctl_mutex is global, just get rid of loop_index_mutex as
there is no good reason to keep these two separate and it just
complicates the locking.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3de2cd94225a..dcdc96f4d2d4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -83,7 +83,6 @@
 #include <linux/uaccess.h>
 
 static DEFINE_IDR(loop_index_idr);
-static DEFINE_MUTEX(loop_index_mutex);
 static DEFINE_MUTEX(loop_ctl_mutex);
 
 static int max_part;
@@ -1626,9 +1625,11 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 static int lo_open(struct block_device *bdev, fmode_t mode)
 {
 	struct loop_device *lo;
-	int err = 0;
+	int err;
 
-	mutex_lock(&loop_index_mutex);
+	err = mutex_lock_killable(&loop_ctl_mutex);
+	if (err)
+		return err;
 	lo = bdev->bd_disk->private_data;
 	if (!lo) {
 		err = -ENXIO;
@@ -1637,7 +1638,7 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
 
 	atomic_inc(&lo->lo_refcnt);
 out:
-	mutex_unlock(&loop_index_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 	return err;
 }
 
@@ -1646,12 +1647,11 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 	struct loop_device *lo;
 	int err;
 
-	mutex_lock(&loop_index_mutex);
+	mutex_lock(&loop_ctl_mutex);
 	lo = disk->private_data;
 	if (atomic_dec_return(&lo->lo_refcnt))
-		goto unlock_index;
+		goto out_unlock;
 
-	mutex_lock(&loop_ctl_mutex);
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
 		/*
 		 * In autoclear mode, stop the loop thread
@@ -1659,7 +1659,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 */
 		err = loop_clr_fd(lo);
 		if (!err)
-			goto unlock_index;
+			return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
 		 * Otherwise keep thread (if running) and config,
@@ -1669,9 +1669,8 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		blk_mq_unfreeze_queue(lo->lo_queue);
 	}
 
+out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
-unlock_index:
-	mutex_unlock(&loop_index_mutex);
 }
 
 static const struct block_device_operations lo_fops = {
@@ -1972,7 +1971,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 	struct kobject *kobj;
 	int err;
 
-	mutex_lock(&loop_index_mutex);
+	mutex_lock(&loop_ctl_mutex);
 	err = loop_lookup(&lo, MINOR(dev) >> part_shift);
 	if (err < 0)
 		err = loop_add(&lo, MINOR(dev) >> part_shift);
@@ -1980,7 +1979,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 		kobj = NULL;
 	else
 		kobj = get_disk_and_module(lo->lo_disk);
-	mutex_unlock(&loop_index_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 
 	*part = 0;
 	return kobj;
@@ -1990,9 +1989,13 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 			       unsigned long parm)
 {
 	struct loop_device *lo;
-	int ret = -ENOSYS;
+	int ret;
 
-	mutex_lock(&loop_index_mutex);
+	ret = mutex_lock_killable(&loop_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ret = -ENOSYS;
 	switch (cmd) {
 	case LOOP_CTL_ADD:
 		ret = loop_lookup(&lo, parm);
@@ -2006,9 +2009,6 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 		ret = loop_lookup(&lo, parm);
 		if (ret < 0)
 			break;
-		ret = mutex_lock_killable(&loop_ctl_mutex);
-		if (ret)
-			break;
 		if (lo->lo_state != Lo_unbound) {
 			ret = -EBUSY;
 			mutex_unlock(&loop_ctl_mutex);
@@ -2020,7 +2020,6 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 			break;
 		}
 		lo->lo_disk->private_data = NULL;
-		mutex_unlock(&loop_ctl_mutex);
 		idr_remove(&loop_index_idr, lo->lo_number);
 		loop_remove(lo);
 		break;
@@ -2030,7 +2029,7 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 			break;
 		ret = loop_add(&lo, -1);
 	}
-	mutex_unlock(&loop_index_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 
 	return ret;
 }
@@ -2114,10 +2113,10 @@ static int __init loop_init(void)
 				  THIS_MODULE, loop_probe, NULL, NULL);
 
 	/* pre-create number of devices given by config or max_loop */
-	mutex_lock(&loop_index_mutex);
+	mutex_lock(&loop_ctl_mutex);
 	for (i = 0; i < nr; i++)
 		loop_add(&lo, i);
-	mutex_unlock(&loop_index_mutex);
+	mutex_unlock(&loop_ctl_mutex);
 
 	printk(KERN_INFO "loop: module loaded\n");
 	return 0;

From a13165441d58b216adbd50252a9cc829d78a6bce Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:05 +0100
Subject: [PATCH 060/351] loop: Push lo_ctl_mutex down into individual ioctls

Push acquisition of lo_ctl_mutex down into individual ioctl handling
branches. This is a preparatory step for pushing the lock down into
individual ioctl handling functions so that they can release the lock as
they need it. We also factor out some simple ioctl handlers that will
not need any special handling to reduce unnecessary code duplication.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 88 +++++++++++++++++++++++++++++++-------------
 1 file changed, 63 insertions(+), 25 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index dcdc96f4d2d4..4c37578989c4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1393,70 +1393,108 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
 	return 0;
 }
 
+static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
+			   unsigned long arg)
+{
+	int err;
+
+	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	if (err)
+		return err;
+	switch (cmd) {
+	case LOOP_SET_CAPACITY:
+		err = loop_set_capacity(lo);
+		break;
+	case LOOP_SET_DIRECT_IO:
+		err = loop_set_dio(lo, arg);
+		break;
+	case LOOP_SET_BLOCK_SIZE:
+		err = loop_set_block_size(lo, arg);
+		break;
+	default:
+		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+	}
+	mutex_unlock(&loop_ctl_mutex);
+	return err;
+}
+
 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	unsigned int cmd, unsigned long arg)
 {
 	struct loop_device *lo = bdev->bd_disk->private_data;
 	int err;
 
-	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-	if (err)
-		goto out_unlocked;
-
 	switch (cmd) {
 	case LOOP_SET_FD:
+		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+		if (err)
+			return err;
 		err = loop_set_fd(lo, mode, bdev, arg);
+		mutex_unlock(&loop_ctl_mutex);
 		break;
 	case LOOP_CHANGE_FD:
+		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+		if (err)
+			return err;
 		err = loop_change_fd(lo, bdev, arg);
+		mutex_unlock(&loop_ctl_mutex);
 		break;
 	case LOOP_CLR_FD:
+		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+		if (err)
+			return err;
 		/* loop_clr_fd would have unlocked loop_ctl_mutex on success */
 		err = loop_clr_fd(lo);
-		if (!err)
-			goto out_unlocked;
+		if (err)
+			mutex_unlock(&loop_ctl_mutex);
 		break;
 	case LOOP_SET_STATUS:
 		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
+			err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+			if (err)
+				return err;
 			err = loop_set_status_old(lo,
 					(struct loop_info __user *)arg);
+			mutex_unlock(&loop_ctl_mutex);
+		}
 		break;
 	case LOOP_GET_STATUS:
+		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+		if (err)
+			return err;
 		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
 		/* loop_get_status() unlocks loop_ctl_mutex */
-		goto out_unlocked;
+		break;
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
+			err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+			if (err)
+				return err;
 			err = loop_set_status64(lo,
 					(struct loop_info64 __user *) arg);
+			mutex_unlock(&loop_ctl_mutex);
+		}
 		break;
 	case LOOP_GET_STATUS64:
+		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+		if (err)
+			return err;
 		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
 		/* loop_get_status() unlocks loop_ctl_mutex */
-		goto out_unlocked;
+		break;
 	case LOOP_SET_CAPACITY:
-		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
-			err = loop_set_capacity(lo);
-		break;
 	case LOOP_SET_DIRECT_IO:
-		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
-			err = loop_set_dio(lo, arg);
-		break;
 	case LOOP_SET_BLOCK_SIZE:
-		err = -EPERM;
-		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
-			err = loop_set_block_size(lo, arg);
-		break;
+		if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		/* Fall through */
 	default:
-		err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+		err = lo_simple_ioctl(lo, cmd, arg);
+		break;
 	}
-	mutex_unlock(&loop_ctl_mutex);
 
-out_unlocked:
 	return err;
 }
 

From a2505b799a496b7b84d9a4a14ec870ff9e42e11b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:06 +0100
Subject: [PATCH 061/351] loop: Split setting of lo_state from loop_clr_fd

Move setting of lo_state to Lo_rundown out into the callers. That will
allow us to unlock loop_ctl_mutex while the loop device is protected
from other changes by its special state.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 52 ++++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 4c37578989c4..eb01a685da4e 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -975,7 +975,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 		loop_reread_partitions(lo, bdev);
 
 	/* Grab the block_device to prevent its destruction after we
-	 * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
+	 * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
 	 */
 	bdgrab(bdev);
 	return 0;
@@ -1025,31 +1025,15 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
 	return err;
 }
 
-static int loop_clr_fd(struct loop_device *lo)
+static int __loop_clr_fd(struct loop_device *lo)
 {
 	struct file *filp = lo->lo_backing_file;
 	gfp_t gfp = lo->old_gfp_mask;
 	struct block_device *bdev = lo->lo_device;
 
-	if (lo->lo_state != Lo_bound)
+	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown))
 		return -ENXIO;
 
-	/*
-	 * If we've explicitly asked to tear down the loop device,
-	 * and it has an elevated reference count, set it for auto-teardown when
-	 * the last reference goes away. This stops $!~#$@ udev from
-	 * preventing teardown because it decided that it needs to run blkid on
-	 * the loopback device whenever they appear. xfstests is notorious for
-	 * failing tests because blkid via udev races with a losetup
-	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
-	 * command to fail with EBUSY.
-	 */
-	if (atomic_read(&lo->lo_refcnt) > 1) {
-		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
-		mutex_unlock(&loop_ctl_mutex);
-		return 0;
-	}
-
 	if (filp == NULL)
 		return -EINVAL;
 
@@ -1057,7 +1041,6 @@ static int loop_clr_fd(struct loop_device *lo)
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	spin_lock_irq(&lo->lo_lock);
-	lo->lo_state = Lo_rundown;
 	lo->lo_backing_file = NULL;
 	spin_unlock_irq(&lo->lo_lock);
 
@@ -1110,6 +1093,30 @@ static int loop_clr_fd(struct loop_device *lo)
 	return 0;
 }
 
+static int loop_clr_fd(struct loop_device *lo)
+{
+	if (lo->lo_state != Lo_bound)
+		return -ENXIO;
+	/*
+	 * If we've explicitly asked to tear down the loop device,
+	 * and it has an elevated reference count, set it for auto-teardown when
+	 * the last reference goes away. This stops $!~#$@ udev from
+	 * preventing teardown because it decided that it needs to run blkid on
+	 * the loopback device whenever they appear. xfstests is notorious for
+	 * failing tests because blkid via udev races with a losetup
+	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
+	 * command to fail with EBUSY.
+	 */
+	if (atomic_read(&lo->lo_refcnt) > 1) {
+		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
+		mutex_unlock(&loop_ctl_mutex);
+		return 0;
+	}
+	lo->lo_state = Lo_rundown;
+
+	return __loop_clr_fd(lo);
+}
+
 static int
 loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 {
@@ -1691,11 +1698,14 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		goto out_unlock;
 
 	if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
+		if (lo->lo_state != Lo_bound)
+			goto out_unlock;
+		lo->lo_state = Lo_rundown;
 		/*
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		err = loop_clr_fd(lo);
+		err = __loop_clr_fd(lo);
 		if (!err)
 			return;
 	} else if (lo->lo_state == Lo_bound) {

From 7ccd0791d98531df7cd59e92d55e4f063d48a070 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:07 +0100
Subject: [PATCH 062/351] loop: Push loop_ctl_mutex down into loop_clr_fd()

loop_clr_fd() has a weird locking convention that is expects
loop_ctl_mutex held, releases it on success and keeps it on failure.
Untangle the mess by moving locking of loop_ctl_mutex into
loop_clr_fd().

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 49 ++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index eb01a685da4e..d8a7b5da881b 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1027,15 +1027,22 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
 
 static int __loop_clr_fd(struct loop_device *lo)
 {
-	struct file *filp = lo->lo_backing_file;
+	struct file *filp = NULL;
 	gfp_t gfp = lo->old_gfp_mask;
 	struct block_device *bdev = lo->lo_device;
+	int err = 0;
 
-	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown))
-		return -ENXIO;
+	mutex_lock(&loop_ctl_mutex);
+	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
+		err = -ENXIO;
+		goto out_unlock;
+	}
 
-	if (filp == NULL)
-		return -EINVAL;
+	filp = lo->lo_backing_file;
+	if (filp == NULL) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
 
 	/* freeze request queue during the transition */
 	blk_mq_freeze_queue(lo->lo_queue);
@@ -1082,6 +1089,7 @@ static int __loop_clr_fd(struct loop_device *lo)
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
 	loop_unprepare_queue(lo);
+out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
 	/*
 	 * Need not hold loop_ctl_mutex to fput backing file.
@@ -1089,14 +1097,22 @@ static int __loop_clr_fd(struct loop_device *lo)
 	 * lock dependency possibility warning as fput can take
 	 * bd_mutex which is usually taken before loop_ctl_mutex.
 	 */
-	fput(filp);
-	return 0;
+	if (filp)
+		fput(filp);
+	return err;
 }
 
 static int loop_clr_fd(struct loop_device *lo)
 {
-	if (lo->lo_state != Lo_bound)
+	int err;
+
+	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	if (err)
+		return err;
+	if (lo->lo_state != Lo_bound) {
+		mutex_unlock(&loop_ctl_mutex);
 		return -ENXIO;
+	}
 	/*
 	 * If we've explicitly asked to tear down the loop device,
 	 * and it has an elevated reference count, set it for auto-teardown when
@@ -1113,6 +1129,7 @@ static int loop_clr_fd(struct loop_device *lo)
 		return 0;
 	}
 	lo->lo_state = Lo_rundown;
+	mutex_unlock(&loop_ctl_mutex);
 
 	return __loop_clr_fd(lo);
 }
@@ -1447,14 +1464,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		mutex_unlock(&loop_ctl_mutex);
 		break;
 	case LOOP_CLR_FD:
-		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-		if (err)
-			return err;
-		/* loop_clr_fd would have unlocked loop_ctl_mutex on success */
-		err = loop_clr_fd(lo);
-		if (err)
-			mutex_unlock(&loop_ctl_mutex);
-		break;
+		return loop_clr_fd(lo);
 	case LOOP_SET_STATUS:
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
@@ -1690,7 +1700,6 @@ out:
 static void lo_release(struct gendisk *disk, fmode_t mode)
 {
 	struct loop_device *lo;
-	int err;
 
 	mutex_lock(&loop_ctl_mutex);
 	lo = disk->private_data;
@@ -1701,13 +1710,13 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		if (lo->lo_state != Lo_bound)
 			goto out_unlock;
 		lo->lo_state = Lo_rundown;
+		mutex_unlock(&loop_ctl_mutex);
 		/*
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		err = __loop_clr_fd(lo);
-		if (!err)
-			return;
+		__loop_clr_fd(lo);
+		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*
 		 * Otherwise keep thread (if running) and config,

From 4a5ce9ba5877e4640200d84a735361306ad1a1b8 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:08 +0100
Subject: [PATCH 063/351] loop: Push loop_ctl_mutex down to loop_get_status()

Push loop_ctl_mutex down to loop_get_status() to avoid the unusual
convention that the function gets called with loop_ctl_mutex held and
releases it.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 37 ++++++++++---------------------------
 1 file changed, 10 insertions(+), 27 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index d8a7b5da881b..2e814f8af4df 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1232,6 +1232,9 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 	struct kstat stat;
 	int ret;
 
+	ret = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	if (ret)
+		return ret;
 	if (lo->lo_state != Lo_bound) {
 		mutex_unlock(&loop_ctl_mutex);
 		return -ENXIO;
@@ -1346,10 +1349,8 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
 	struct loop_info64 info64;
 	int err;
 
-	if (!arg) {
-		mutex_unlock(&loop_ctl_mutex);
+	if (!arg)
 		return -EINVAL;
-	}
 	err = loop_get_status(lo, &info64);
 	if (!err)
 		err = loop_info64_to_old(&info64, &info);
@@ -1364,10 +1365,8 @@ loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
 	struct loop_info64 info64;
 	int err;
 
-	if (!arg) {
-		mutex_unlock(&loop_ctl_mutex);
+	if (!arg)
 		return -EINVAL;
-	}
 	err = loop_get_status(lo, &info64);
 	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
 		err = -EFAULT;
@@ -1477,12 +1476,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		}
 		break;
 	case LOOP_GET_STATUS:
-		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-		if (err)
-			return err;
-		err = loop_get_status_old(lo, (struct loop_info __user *) arg);
-		/* loop_get_status() unlocks loop_ctl_mutex */
-		break;
+		return loop_get_status_old(lo, (struct loop_info __user *) arg);
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
@@ -1495,12 +1489,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 		}
 		break;
 	case LOOP_GET_STATUS64:
-		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-		if (err)
-			return err;
-		err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
-		/* loop_get_status() unlocks loop_ctl_mutex */
-		break;
+		return loop_get_status64(lo, (struct loop_info64 __user *) arg);
 	case LOOP_SET_CAPACITY:
 	case LOOP_SET_DIRECT_IO:
 	case LOOP_SET_BLOCK_SIZE:
@@ -1625,10 +1614,8 @@ loop_get_status_compat(struct loop_device *lo,
 	struct loop_info64 info64;
 	int err;
 
-	if (!arg) {
-		mutex_unlock(&loop_ctl_mutex);
+	if (!arg)
 		return -EINVAL;
-	}
 	err = loop_get_status(lo, &info64);
 	if (!err)
 		err = loop_info64_to_compat(&info64, arg);
@@ -1651,12 +1638,8 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 		}
 		break;
 	case LOOP_GET_STATUS:
-		err = mutex_lock_killable(&loop_ctl_mutex);
-		if (!err) {
-			err = loop_get_status_compat(lo,
-						     (struct compat_loop_info __user *)arg);
-			/* loop_get_status() unlocks loop_ctl_mutex */
-		}
+		err = loop_get_status_compat(lo,
+				     (struct compat_loop_info __user *)arg);
 		break;
 	case LOOP_SET_CAPACITY:
 	case LOOP_CLR_FD:

From 550df5fdacff94229cde0ed9b8085155654c1696 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:09 +0100
Subject: [PATCH 064/351] loop: Push loop_ctl_mutex down to loop_set_status()

Push loop_ctl_mutex down to loop_set_status(). We will need this to be
able to call loop_reread_partitions() without loop_ctl_mutex.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 51 ++++++++++++++++++++++----------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 2e814f8af4df..af79a59732b7 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1141,46 +1141,55 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	struct loop_func_table *xfer;
 	kuid_t uid = current_uid();
 
+	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	if (err)
+		return err;
 	if (lo->lo_encrypt_key_size &&
 	    !uid_eq(lo->lo_key_owner, uid) &&
-	    !capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	if (lo->lo_state != Lo_bound)
-		return -ENXIO;
-	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
-		return -EINVAL;
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out_unlock;
+	}
+	if (lo->lo_state != Lo_bound) {
+		err = -ENXIO;
+		goto out_unlock;
+	}
+	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) {
+		err = -EINVAL;
+		goto out_unlock;
+	}
 
 	/* I/O need to be drained during transfer transition */
 	blk_mq_freeze_queue(lo->lo_queue);
 
 	err = loop_release_xfer(lo);
 	if (err)
-		goto exit;
+		goto out_unfreeze;
 
 	if (info->lo_encrypt_type) {
 		unsigned int type = info->lo_encrypt_type;
 
 		if (type >= MAX_LO_CRYPT) {
 			err = -EINVAL;
-			goto exit;
+			goto out_unfreeze;
 		}
 		xfer = xfer_funcs[type];
 		if (xfer == NULL) {
 			err = -EINVAL;
-			goto exit;
+			goto out_unfreeze;
 		}
 	} else
 		xfer = NULL;
 
 	err = loop_init_xfer(lo, xfer, info);
 	if (err)
-		goto exit;
+		goto out_unfreeze;
 
 	if (lo->lo_offset != info->lo_offset ||
 	    lo->lo_sizelimit != info->lo_sizelimit) {
 		if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) {
 			err = -EFBIG;
-			goto exit;
+			goto out_unfreeze;
 		}
 	}
 
@@ -1212,7 +1221,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	/* update dio if lo_offset or transfer is changed */
 	__loop_update_dio(lo, lo->use_dio);
 
- exit:
+out_unfreeze:
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
@@ -1221,6 +1230,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 		lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
 		loop_reread_partitions(lo, lo->lo_device);
 	}
+out_unlock:
+	mutex_unlock(&loop_ctl_mutex);
 
 	return err;
 }
@@ -1467,12 +1478,8 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	case LOOP_SET_STATUS:
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
-			err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-			if (err)
-				return err;
 			err = loop_set_status_old(lo,
 					(struct loop_info __user *)arg);
-			mutex_unlock(&loop_ctl_mutex);
 		}
 		break;
 	case LOOP_GET_STATUS:
@@ -1480,12 +1487,8 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	case LOOP_SET_STATUS64:
 		err = -EPERM;
 		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
-			err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-			if (err)
-				return err;
 			err = loop_set_status64(lo,
 					(struct loop_info64 __user *) arg);
-			mutex_unlock(&loop_ctl_mutex);
 		}
 		break;
 	case LOOP_GET_STATUS64:
@@ -1630,12 +1633,8 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 
 	switch(cmd) {
 	case LOOP_SET_STATUS:
-		err = mutex_lock_killable(&loop_ctl_mutex);
-		if (!err) {
-			err = loop_set_status_compat(lo,
-						     (const struct compat_loop_info __user *)arg);
-			mutex_unlock(&loop_ctl_mutex);
-		}
+		err = loop_set_status_compat(lo,
+			     (const struct compat_loop_info __user *)arg);
 		break;
 	case LOOP_GET_STATUS:
 		err = loop_get_status_compat(lo,

From 757ecf40b7e029529768eb5f9562d5eeb3002106 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:10 +0100
Subject: [PATCH 065/351] loop: Push loop_ctl_mutex down to loop_set_fd()

Push lo_ctl_mutex down to loop_set_fd(). We will need this to be able to
call loop_reread_partitions() without lo_ctl_mutex.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index af79a59732b7..161e2a08f2e8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -918,13 +918,17 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	if (!file)
 		goto out;
 
+	error = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	if (error)
+		goto out_putf;
+
 	error = -EBUSY;
 	if (lo->lo_state != Lo_unbound)
-		goto out_putf;
+		goto out_unlock;
 
 	error = loop_validate_file(file, bdev);
 	if (error)
-		goto out_putf;
+		goto out_unlock;
 
 	mapping = file->f_mapping;
 	inode = mapping->host;
@@ -936,10 +940,10 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	error = -EFBIG;
 	size = get_loop_size(lo, file);
 	if ((loff_t)(sector_t)size != size)
-		goto out_putf;
+		goto out_unlock;
 	error = loop_prepare_queue(lo);
 	if (error)
-		goto out_putf;
+		goto out_unlock;
 
 	error = 0;
 
@@ -978,11 +982,14 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	 * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
 	 */
 	bdgrab(bdev);
+	mutex_unlock(&loop_ctl_mutex);
 	return 0;
 
- out_putf:
+out_unlock:
+	mutex_unlock(&loop_ctl_mutex);
+out_putf:
 	fput(file);
- out:
+out:
 	/* This is safe: open() is still holding a reference. */
 	module_put(THIS_MODULE);
 	return error;
@@ -1460,12 +1467,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 
 	switch (cmd) {
 	case LOOP_SET_FD:
-		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-		if (err)
-			return err;
-		err = loop_set_fd(lo, mode, bdev, arg);
-		mutex_unlock(&loop_ctl_mutex);
-		break;
+		return loop_set_fd(lo, mode, bdev, arg);
 	case LOOP_CHANGE_FD:
 		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
 		if (err)

From c371077000f4138ee3c15fbed50101ff24bdc91d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:11 +0100
Subject: [PATCH 066/351] loop: Push loop_ctl_mutex down to loop_change_fd()

Push loop_ctl_mutex down to loop_change_fd(). We will need this to be
able to call loop_reread_partitions() without loop_ctl_mutex.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 161e2a08f2e8..ea5e313908b1 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -691,19 +691,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	struct file	*file, *old_file;
 	int		error;
 
+	error = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	if (error)
+		return error;
 	error = -ENXIO;
 	if (lo->lo_state != Lo_bound)
-		goto out;
+		goto out_unlock;
 
 	/* the loop device has to be read-only */
 	error = -EINVAL;
 	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
-		goto out;
+		goto out_unlock;
 
 	error = -EBADF;
 	file = fget(arg);
 	if (!file)
-		goto out;
+		goto out_unlock;
 
 	error = loop_validate_file(file, bdev);
 	if (error)
@@ -730,11 +733,13 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	fput(old_file);
 	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
 		loop_reread_partitions(lo, bdev);
+	mutex_unlock(&loop_ctl_mutex);
 	return 0;
 
- out_putf:
+out_putf:
 	fput(file);
- out:
+out_unlock:
+	mutex_unlock(&loop_ctl_mutex);
 	return error;
 }
 
@@ -1469,12 +1474,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode,
 	case LOOP_SET_FD:
 		return loop_set_fd(lo, mode, bdev, arg);
 	case LOOP_CHANGE_FD:
-		err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
-		if (err)
-			return err;
-		err = loop_change_fd(lo, bdev, arg);
-		mutex_unlock(&loop_ctl_mutex);
-		break;
+		return loop_change_fd(lo, bdev, arg);
 	case LOOP_CLR_FD:
 		return loop_clr_fd(lo);
 	case LOOP_SET_STATUS:

From d57f3374ba4817f7c8d26fae8a13d20ac8d31b92 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:12 +0100
Subject: [PATCH 067/351] loop: Move special partition reread handling in
 loop_clr_fd()

The call of __blkdev_reread_part() from loop_reread_partition() happens
only when we need to invalidate partitions from loop_release(). Thus
move a detection for this into loop_clr_fd() and simplify
loop_reread_partition().

This makes loop_reread_partition() safe to use without loop_ctl_mutex
because we use only lo->lo_number and lo->lo_file_name in case of error
for reporting purposes (thus possibly reporting outdate information is
not a big deal) and we are safe from 'lo' going away under us by
elevated lo->lo_refcnt.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ea5e313908b1..f1d7a4fe30fc 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -630,18 +630,7 @@ static void loop_reread_partitions(struct loop_device *lo,
 {
 	int rc;
 
-	/*
-	 * bd_mutex has been held already in release path, so don't
-	 * acquire it if this function is called in such case.
-	 *
-	 * If the reread partition isn't from release path, lo_refcnt
-	 * must be at least one and it can only become zero when the
-	 * current holder is released.
-	 */
-	if (!atomic_read(&lo->lo_refcnt))
-		rc = __blkdev_reread_part(bdev);
-	else
-		rc = blkdev_reread_part(bdev);
+	rc = blkdev_reread_part(bdev);
 	if (rc)
 		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
 			__func__, lo->lo_number, lo->lo_file_name, rc);
@@ -1095,8 +1084,24 @@ static int __loop_clr_fd(struct loop_device *lo)
 	module_put(THIS_MODULE);
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
-		loop_reread_partitions(lo, bdev);
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) {
+		/*
+		 * bd_mutex has been held already in release path, so don't
+		 * acquire it if this function is called in such case.
+		 *
+		 * If the reread partition isn't from release path, lo_refcnt
+		 * must be at least one and it can only become zero when the
+		 * current holder is released.
+		 */
+		if (!atomic_read(&lo->lo_refcnt))
+			err = __blkdev_reread_part(bdev);
+		else
+			err = blkdev_reread_part(bdev);
+		pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
+			__func__, lo->lo_number, err);
+		/* Device is gone, no point in returning error */
+		err = 0;
+	}
 	lo->lo_flags = 0;
 	if (!part_shift)
 		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;

From 85b0a54a82e4fbceeb1aebb7cb6909edd1a24668 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:13 +0100
Subject: [PATCH 068/351] loop: Move loop_reread_partitions() out of
 loop_ctl_mutex

Calling loop_reread_partitions() under loop_ctl_mutex causes lockdep to
complain about circular lock dependency between bdev->bd_mutex and
lo->lo_ctl_mutex. The problem is that on loop device open or close
lo_open() and lo_release() get called with bdev->bd_mutex held and they
need to acquire loop_ctl_mutex. OTOH when loop_reread_partitions() is
called with loop_ctl_mutex held, it will call blkdev_reread_part() which
acquires bdev->bd_mutex. See syzbot report for details [1].

Move all calls of loop_rescan_partitions() out of loop_ctl_mutex to
avoid lockdep warning and fix deadlock possibility.

[1] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d1588

Reported-by: syzbot <syzbot+4684a000d5abdade83fac55b1e7d1f935ef1936e@syzkaller.appspotmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f1d7a4fe30fc..cce5d4e8e863 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -679,6 +679,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 {
 	struct file	*file, *old_file;
 	int		error;
+	bool		partscan;
 
 	error = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
 	if (error)
@@ -720,9 +721,10 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
 	fput(old_file);
-	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-		loop_reread_partitions(lo, bdev);
+	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
 	mutex_unlock(&loop_ctl_mutex);
+	if (partscan)
+		loop_reread_partitions(lo, bdev);
 	return 0;
 
 out_putf:
@@ -903,6 +905,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	int		lo_flags = 0;
 	int		error;
 	loff_t		size;
+	bool		partscan;
 
 	/* This is safe, since we have a reference from open(). */
 	__module_get(THIS_MODULE);
@@ -969,14 +972,15 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->lo_state = Lo_bound;
 	if (part_shift)
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
-	if (lo->lo_flags & LO_FLAGS_PARTSCAN)
-		loop_reread_partitions(lo, bdev);
+	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
 
 	/* Grab the block_device to prevent its destruction after we
 	 * put /dev/loopXX inode. Later in __loop_clr_fd() we bdput(bdev).
 	 */
 	bdgrab(bdev);
 	mutex_unlock(&loop_ctl_mutex);
+	if (partscan)
+		loop_reread_partitions(lo, bdev);
 	return 0;
 
 out_unlock:
@@ -1157,6 +1161,8 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	int err;
 	struct loop_func_table *xfer;
 	kuid_t uid = current_uid();
+	struct block_device *bdev;
+	bool partscan = false;
 
 	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
 	if (err)
@@ -1245,10 +1251,13 @@ out_unfreeze:
 	     !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
 		lo->lo_flags |= LO_FLAGS_PARTSCAN;
 		lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
-		loop_reread_partitions(lo, lo->lo_device);
+		bdev = lo->lo_device;
+		partscan = true;
 	}
 out_unlock:
 	mutex_unlock(&loop_ctl_mutex);
+	if (partscan)
+		loop_reread_partitions(lo, bdev);
 
 	return err;
 }

From 0da03cab87e6323ff2e05b14bc7d5c6fcc531efd Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:14 +0100
Subject: [PATCH 069/351] loop: Fix deadlock when calling blkdev_reread_part()

Calling blkdev_reread_part() under loop_ctl_mutex causes lockdep to
complain about circular lock dependency between bdev->bd_mutex and
lo->lo_ctl_mutex. The problem is that on loop device open or close
lo_open() and lo_release() get called with bdev->bd_mutex held and they
need to acquire loop_ctl_mutex. OTOH when loop_reread_partitions() is
called with loop_ctl_mutex held, it will call blkdev_reread_part() which
acquires bdev->bd_mutex. See syzbot report for details [1].

Move call to blkdev_reread_part() in __loop_clr_fd() from under
loop_ctl_mutex to finish fixing of the lockdep warning and the possible
deadlock.

[1] https://syzkaller.appspot.com/bug?id=bf154052f0eea4bc7712499e4569505907d1588

Reported-by: syzbot <syzbot+4684a000d5abdade83fac55b1e7d1f935ef1936e@syzkaller.appspotmail.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 28 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cce5d4e8e863..b3f981ac8ef1 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1030,12 +1030,14 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
 	return err;
 }
 
-static int __loop_clr_fd(struct loop_device *lo)
+static int __loop_clr_fd(struct loop_device *lo, bool release)
 {
 	struct file *filp = NULL;
 	gfp_t gfp = lo->old_gfp_mask;
 	struct block_device *bdev = lo->lo_device;
 	int err = 0;
+	bool partscan = false;
+	int lo_number;
 
 	mutex_lock(&loop_ctl_mutex);
 	if (WARN_ON_ONCE(lo->lo_state != Lo_rundown)) {
@@ -1088,7 +1090,15 @@ static int __loop_clr_fd(struct loop_device *lo)
 	module_put(THIS_MODULE);
 	blk_mq_unfreeze_queue(lo->lo_queue);
 
-	if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) {
+	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN && bdev;
+	lo_number = lo->lo_number;
+	lo->lo_flags = 0;
+	if (!part_shift)
+		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+	loop_unprepare_queue(lo);
+out_unlock:
+	mutex_unlock(&loop_ctl_mutex);
+	if (partscan) {
 		/*
 		 * bd_mutex has been held already in release path, so don't
 		 * acquire it if this function is called in such case.
@@ -1097,21 +1107,15 @@ static int __loop_clr_fd(struct loop_device *lo)
 		 * must be at least one and it can only become zero when the
 		 * current holder is released.
 		 */
-		if (!atomic_read(&lo->lo_refcnt))
+		if (release)
 			err = __blkdev_reread_part(bdev);
 		else
 			err = blkdev_reread_part(bdev);
 		pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
-			__func__, lo->lo_number, err);
+			__func__, lo_number, err);
 		/* Device is gone, no point in returning error */
 		err = 0;
 	}
-	lo->lo_flags = 0;
-	if (!part_shift)
-		lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
-	loop_unprepare_queue(lo);
-out_unlock:
-	mutex_unlock(&loop_ctl_mutex);
 	/*
 	 * Need not hold loop_ctl_mutex to fput backing file.
 	 * Calling fput holding loop_ctl_mutex triggers a circular
@@ -1152,7 +1156,7 @@ static int loop_clr_fd(struct loop_device *lo)
 	lo->lo_state = Lo_rundown;
 	mutex_unlock(&loop_ctl_mutex);
 
-	return __loop_clr_fd(lo);
+	return __loop_clr_fd(lo, false);
 }
 
 static int
@@ -1713,7 +1717,7 @@ static void lo_release(struct gendisk *disk, fmode_t mode)
 		 * In autoclear mode, stop the loop thread
 		 * and remove configuration after last close.
 		 */
-		__loop_clr_fd(lo);
+		__loop_clr_fd(lo, true);
 		return;
 	} else if (lo->lo_state == Lo_bound) {
 		/*

From 1dded9acf6dc9a34cd27fcf8815507e4e65b3c4f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:15 +0100
Subject: [PATCH 070/351] loop: Avoid circular locking dependency between
 loop_ctl_mutex and bd_mutex

Code in loop_change_fd() drops reference to the old file (and also the
new file in a failure case) under loop_ctl_mutex. Similarly to a
situation in loop_set_fd() this can create a circular locking dependency
if this was the last reference holding the file open. Delay dropping of
the file reference until we have released loop_ctl_mutex.

Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b3f981ac8ef1..112afc9bc604 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -677,7 +677,7 @@ static int loop_validate_file(struct file *file, struct block_device *bdev)
 static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 			  unsigned int arg)
 {
-	struct file	*file, *old_file;
+	struct file	*file = NULL, *old_file;
 	int		error;
 	bool		partscan;
 
@@ -686,21 +686,21 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 		return error;
 	error = -ENXIO;
 	if (lo->lo_state != Lo_bound)
-		goto out_unlock;
+		goto out_err;
 
 	/* the loop device has to be read-only */
 	error = -EINVAL;
 	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
-		goto out_unlock;
+		goto out_err;
 
 	error = -EBADF;
 	file = fget(arg);
 	if (!file)
-		goto out_unlock;
+		goto out_err;
 
 	error = loop_validate_file(file, bdev);
 	if (error)
-		goto out_putf;
+		goto out_err;
 
 	old_file = lo->lo_backing_file;
 
@@ -708,7 +708,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 
 	/* size of the new backing store needs to be the same */
 	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
-		goto out_putf;
+		goto out_err;
 
 	/* and ... switch */
 	blk_mq_freeze_queue(lo->lo_queue);
@@ -719,18 +719,22 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 			     lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 	loop_update_dio(lo);
 	blk_mq_unfreeze_queue(lo->lo_queue);
-
-	fput(old_file);
 	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
 	mutex_unlock(&loop_ctl_mutex);
+	/*
+	 * We must drop file reference outside of loop_ctl_mutex as dropping
+	 * the file ref can take bd_mutex which creates circular locking
+	 * dependency.
+	 */
+	fput(old_file);
 	if (partscan)
 		loop_reread_partitions(lo, bdev);
 	return 0;
 
-out_putf:
-	fput(file);
-out_unlock:
+out_err:
 	mutex_unlock(&loop_ctl_mutex);
+	if (file)
+		fput(file);
 	return error;
 }
 

From c28445fa06a3a54e06938559b9514c5a7f01c90f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 8 Nov 2018 14:01:16 +0100
Subject: [PATCH 071/351] loop: Get rid of 'nested' acquisition of
 loop_ctl_mutex

The nested acquisition of loop_ctl_mutex (->lo_ctl_mutex back then) has
been introduced by commit f028f3b2f987e "loop: fix circular locking in
loop_clr_fd()" to fix lockdep complains about bd_mutex being acquired
after lo_ctl_mutex during partition rereading. Now that these are
properly fixed, let's stop fooling lockdep.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 112afc9bc604..bf6bc35aaf88 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -681,7 +681,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	int		error;
 	bool		partscan;
 
-	error = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	error = mutex_lock_killable(&loop_ctl_mutex);
 	if (error)
 		return error;
 	error = -ENXIO;
@@ -919,7 +919,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	if (!file)
 		goto out;
 
-	error = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	error = mutex_lock_killable(&loop_ctl_mutex);
 	if (error)
 		goto out_putf;
 
@@ -1135,7 +1135,7 @@ static int loop_clr_fd(struct loop_device *lo)
 {
 	int err;
 
-	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
 		return err;
 	if (lo->lo_state != Lo_bound) {
@@ -1172,7 +1172,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
 	struct block_device *bdev;
 	bool partscan = false;
 
-	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
 		return err;
 	if (lo->lo_encrypt_key_size &&
@@ -1277,7 +1277,7 @@ loop_get_status(struct loop_device *lo, struct loop_info64 *info)
 	struct kstat stat;
 	int ret;
 
-	ret = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	ret = mutex_lock_killable(&loop_ctl_mutex);
 	if (ret)
 		return ret;
 	if (lo->lo_state != Lo_bound) {
@@ -1466,7 +1466,7 @@ static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
 {
 	int err;
 
-	err = mutex_lock_killable_nested(&loop_ctl_mutex, 1);
+	err = mutex_lock_killable(&loop_ctl_mutex);
 	if (err)
 		return err;
 	switch (cmd) {

From 7baa85727d0406ffd2b2303cd803a145aa35c505 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Nov 2018 10:24:07 -0700
Subject: [PATCH 072/351] blk-mq-tag: change busy_iter_fn to return whether to
 continue or not

We have this functionality in sbitmap, but we don't export it in
blk-mq for users of the tags busy iteration. This can be useful
for stopping the iteration, if the caller doesn't need to find
more requests.

Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c            |  7 +++++--
 block/blk-mq-tag.c                |  4 ++--
 block/blk-mq.c                    | 16 +++++++++++-----
 drivers/block/mtip32xx/mtip32xx.c |  9 ++++++---
 drivers/block/nbd.c               |  3 ++-
 drivers/block/skd_main.c          |  8 +++++---
 drivers/nvme/host/core.c          |  4 ++--
 drivers/nvme/host/fc.c            |  3 ++-
 drivers/nvme/host/nvme.h          |  2 +-
 include/linux/blk-mq.h            |  4 ++--
 10 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index cde19be36135..f021f4817b80 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -422,15 +422,18 @@ struct show_busy_params {
 
 /*
  * Note: the state of a request may change while this function is in progress,
- * e.g. due to a concurrent blk_mq_finish_request() call.
+ * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
+ * keep iterating requests.
  */
-static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
 {
 	const struct show_busy_params *params = data;
 
 	if (rq->mq_hctx == params->hctx)
 		__blk_mq_debugfs_rq_show(params->m,
 					 list_entry_rq(&rq->queuelist));
+
+	return true;
 }
 
 static int hctx_busy_show(void *data, struct seq_file *m)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index fb836d818b80..097e9a67d5f5 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -236,7 +236,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * test and set the bit before assigning ->rqs[].
 	 */
 	if (rq && rq->q == hctx->queue)
-		iter_data->fn(hctx, rq, iter_data->data, reserved);
+		return iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
 
@@ -289,7 +289,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 */
 	rq = tags->rqs[bitnr];
 	if (rq && blk_mq_request_started(rq))
-		iter_data->fn(rq, iter_data->data, reserved);
+		return iter_data->fn(rq, iter_data->data, reserved);
 
 	return true;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 45c92b8d4795..4a622c832b31 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -94,7 +94,7 @@ struct mq_inflight {
 	unsigned int *inflight;
 };
 
-static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 				  struct request *rq, void *priv,
 				  bool reserved)
 {
@@ -109,6 +109,8 @@ static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 		mi->inflight[0]++;
 	if (mi->part->partno)
 		mi->inflight[1]++;
+
+	return true;
 }
 
 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
@@ -120,7 +122,7 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
 }
 
-static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
 				     struct request *rq, void *priv,
 				     bool reserved)
 {
@@ -128,6 +130,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
 
 	if (rq->part == mi->part)
 		mi->inflight[rq_data_dir(rq)]++;
+
+	return true;
 }
 
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
@@ -821,7 +825,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 	return false;
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
 	unsigned long *next = priv;
@@ -831,7 +835,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 	 * so we're not unnecessarilly synchronizing across CPUs.
 	 */
 	if (!blk_mq_req_expired(rq, next))
-		return;
+		return true;
 
 	/*
 	 * We have reason to believe the request may be expired. Take a
@@ -843,7 +847,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 	 * timeout handler to posting a natural completion.
 	 */
 	if (!refcount_inc_not_zero(&rq->ref))
-		return;
+		return true;
 
 	/*
 	 * The request is now locked and cannot be reallocated underneath the
@@ -855,6 +859,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		blk_mq_rq_timed_out(rq, reserved);
 	if (refcount_dec_and_test(&rq->ref))
 		__blk_mq_free_request(rq);
+
+	return true;
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a7daa8acbab3..947aa10107a6 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2720,7 +2720,7 @@ static void mtip_softirq_done_fn(struct request *rq)
 	blk_mq_end_request(rq, cmd->status);
 }
 
-static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
+static bool mtip_abort_cmd(struct request *req, void *data, bool reserved)
 {
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct driver_data *dd = data;
@@ -2730,14 +2730,16 @@ static void mtip_abort_cmd(struct request *req, void *data, bool reserved)
 	clear_bit(req->tag, dd->port->cmds_to_issue);
 	cmd->status = BLK_STS_IOERR;
 	mtip_softirq_done_fn(req);
+	return true;
 }
 
-static void mtip_queue_cmd(struct request *req, void *data, bool reserved)
+static bool mtip_queue_cmd(struct request *req, void *data, bool reserved)
 {
 	struct driver_data *dd = data;
 
 	set_bit(req->tag, dd->port->cmds_to_issue);
 	blk_abort_request(req);
+	return true;
 }
 
 /*
@@ -3920,12 +3922,13 @@ protocol_init_error:
 	return rv;
 }
 
-static void mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
+static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv)
 {
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
 	cmd->status = BLK_STS_IOERR;
 	blk_mq_complete_request(rq);
+	return true;
 }
 
 /*
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4d4d6129ff66..08696f5f00bb 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -734,12 +734,13 @@ static void recv_work(struct work_struct *work)
 	kfree(args);
 }
 
-static void nbd_clear_req(struct request *req, void *data, bool reserved)
+static bool nbd_clear_req(struct request *req, void *data, bool reserved)
 {
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 
 	cmd->status = BLK_STS_IOERR;
 	blk_mq_complete_request(req);
+	return true;
 }
 
 static void nbd_clear_que(struct nbd_device *nbd)
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index 2459dcc04b1c..a0196477165f 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -382,11 +382,12 @@ static void skd_log_skreq(struct skd_device *skdev,
  * READ/WRITE REQUESTS
  *****************************************************************************
  */
-static void skd_inc_in_flight(struct request *rq, void *data, bool reserved)
+static bool skd_inc_in_flight(struct request *rq, void *data, bool reserved)
 {
 	int *count = data;
 
 	count++;
+	return true;
 }
 
 static int skd_in_flight(struct skd_device *skdev)
@@ -1887,13 +1888,13 @@ static void skd_isr_fwstate(struct skd_device *skdev)
 		skd_skdev_state_to_str(skdev->state), skdev->state);
 }
 
-static void skd_recover_request(struct request *req, void *data, bool reserved)
+static bool skd_recover_request(struct request *req, void *data, bool reserved)
 {
 	struct skd_device *const skdev = data;
 	struct skd_request_context *skreq = blk_mq_rq_to_pdu(req);
 
 	if (skreq->state != SKD_REQ_STATE_BUSY)
-		return;
+		return true;
 
 	skd_log_skreq(skdev, skreq, "recover");
 
@@ -1904,6 +1905,7 @@ static void skd_recover_request(struct request *req, void *data, bool reserved)
 	skreq->state = SKD_REQ_STATE_IDLE;
 	skreq->status = BLK_STS_IOERR;
 	blk_mq_complete_request(req);
+	return true;
 }
 
 static void skd_recover_requests(struct skd_device *skdev)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2e65be8b1387..f172d63db2b5 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -268,14 +268,14 @@ void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
-void nvme_cancel_request(struct request *req, void *data, bool reserved)
+bool nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
 	dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
 				"Cancelling I/O %d", req->tag);
 
 	nvme_req(req)->status = NVME_SC_ABORT_REQ;
 	blk_mq_complete_request(req);
-
+	return true;
 }
 EXPORT_SYMBOL_GPL(nvme_cancel_request);
 
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 0b70c8bab045..98c3c77f48f6 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2386,7 +2386,7 @@ nvme_fc_complete_rq(struct request *rq)
  * status. The done path will return the io request back to the block
  * layer with an error status.
  */
-static void
+static bool
 nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
 {
 	struct nvme_ctrl *nctrl = data;
@@ -2394,6 +2394,7 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved)
 	struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req);
 
 	__nvme_fc_abort_op(ctrl, op);
+	return true;
 }
 
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cee79cb388af..32a1f1cfdfb4 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -408,7 +408,7 @@ static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl)
 }
 
 void nvme_complete_rq(struct request *req);
-void nvme_cancel_request(struct request *req, void *data, bool reserved);
+bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 		enum nvme_ctrl_state new_state);
 int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9f5e93f40857..ff497dfcbbf9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -129,9 +129,9 @@ typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *,
 typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
 		unsigned int);
 
-typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
+typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
-typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
+typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
 typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);

From ae8799125d565c798e49dcab4bf182dbfc483524 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Nov 2018 09:03:51 -0700
Subject: [PATCH 073/351] blk-mq: provide a helper to check if a queue is busy

Returns true if the queue currently has requests pending,
false if not.

DM can use this to replace the atomic_inc/dec they do per device
to see if a device is busy.

Reviewed-by: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 26 ++++++++++++++++++++++++++
 include/linux/blk-mq.h |  2 ++
 2 files changed, 28 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4a622c832b31..4880e13e2394 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -790,6 +790,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
+static bool blk_mq_check_busy(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			      void *priv, bool reserved)
+{
+	/*
+	 * If we find a request, we know the queue is busy. Return false
+	 * to stop the iteration.
+	 */
+	if (rq->q == hctx->queue) {
+		bool *busy = priv;
+
+		*busy = true;
+		return false;
+	}
+
+	return true;
+}
+
+bool blk_mq_queue_busy(struct request_queue *q)
+{
+	bool busy = false;
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_busy, &busy);
+	return busy;
+}
+EXPORT_SYMBOL_GPL(blk_mq_queue_busy);
+
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
 	req->rq_flags |= RQF_TIMED_OUT;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ff497dfcbbf9..929e8abc5535 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -250,6 +250,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
+bool blk_mq_queue_busy(struct request_queue *q);
+
 enum {
 	/* return when out of requests */
 	BLK_MQ_REQ_NOWAIT	= (__force blk_mq_req_flags_t)(1 << 0),

From ab11fe5af1049598d1ff73014f3ea65663246a6e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 8 Nov 2018 11:09:50 -0700
Subject: [PATCH 074/351] blk-mq-tag: document tag iteration helper return
 value

Document the fact that the strategy function passed in can
control whether to continue iterating or not.

Suggested-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 097e9a67d5f5..87bc5df72d48 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -248,7 +248,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  * @fn:		Pointer to the function that will be called for each request
  *		associated with @hctx that has been assigned a driver tag.
  *		@fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
- *		where rq is a pointer to a request.
+ *		where rq is a pointer to a request. Return true to continue
+ *		iterating tags, false to stop.
  * @data:	Will be passed as third argument to @fn.
  * @reserved:	Indicates whether @bt is the breserved_tags member or the
  *		bitmap_tags member of struct blk_mq_tags.
@@ -301,7 +302,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  *		or the bitmap_tags member of struct blk_mq_tags.
  * @fn:		Pointer to the function that will be called for each started
  *		request. @fn will be called as follows: @fn(rq, @data,
- *		@reserved) where rq is a pointer to a request.
+ *		@reserved) where rq is a pointer to a request. Return true
+ *		to continue iterating tags, false to stop.
  * @data:	Will be passed as second argument to @fn.
  * @reserved:	Indicates whether @bt is the breserved_tags member or the
  *		bitmap_tags member of struct blk_mq_tags.
@@ -326,7 +328,8 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
  * @fn:		Pointer to the function that will be called for each started
  *		request. @fn will be called as follows: @fn(rq, @priv,
  *		reserved) where rq is a pointer to a request. 'reserved'
- *		indicates whether or not @rq is a reserved request.
+ *		indicates whether or not @rq is a reserved request. Return
+ *		true to continue iterating tags, false to stop.
  * @priv:	Will be passed as second argument to @fn.
  */
 static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
@@ -343,7 +346,8 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
  * @fn:		Pointer to the function that will be called for each started
  *		request. @fn will be called as follows: @fn(rq, @priv,
  *		reserved) where rq is a pointer to a request. 'reserved'
- *		indicates whether or not @rq is a reserved request.
+ *		indicates whether or not @rq is a reserved request. Return
+ *		true to continue iterating tags, false to stop.
  * @priv:	Will be passed as second argument to @fn.
  */
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,

From cd94c9ed59bad76e68d1c8910fc564351ce2162c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:51:09 +0100
Subject: [PATCH 075/351] sx8: cleanup queue and disk allocation / freeing

Make the disk/queue alloc and free helpers per-port by moving the
trivial loops into the callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/sx8.c | 103 ++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 57 deletions(-)

diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 064b8c5c7a32..1f371b9c0954 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1499,70 +1499,54 @@ static const struct blk_mq_ops carm_mq_ops = {
 	.queue_rq	= carm_queue_rq,
 };
 
-static int carm_init_disks(struct carm_host *host)
+static int carm_init_disk(struct carm_host *host, unsigned int port_no)
 {
-	unsigned int i;
-	int rc = 0;
+	struct carm_port *port = &host->port[port_no];
+	struct gendisk *disk;
+	struct request_queue *q;
 
-	for (i = 0; i < CARM_MAX_PORTS; i++) {
-		struct gendisk *disk;
-		struct request_queue *q;
-		struct carm_port *port;
+	port->host = host;
+	port->port_no = port_no;
 
-		port = &host->port[i];
-		port->host = host;
-		port->port_no = i;
+	disk = alloc_disk(CARM_MINORS_PER_MAJOR);
+	if (!disk)
+		return -ENOMEM;
 
-		disk = alloc_disk(CARM_MINORS_PER_MAJOR);
-		if (!disk) {
-			rc = -ENOMEM;
-			break;
-		}
+	port->disk = disk;
+	sprintf(disk->disk_name, DRV_NAME "/%u",
+		(unsigned int)host->id * CARM_MAX_PORTS + port_no);
+	disk->major = host->major;
+	disk->first_minor = port_no * CARM_MINORS_PER_MAJOR;
+	disk->fops = &carm_bd_ops;
+	disk->private_data = port;
 
-		port->disk = disk;
-		sprintf(disk->disk_name, DRV_NAME "/%u",
-			(unsigned int) (host->id * CARM_MAX_PORTS) + i);
-		disk->major = host->major;
-		disk->first_minor = i * CARM_MINORS_PER_MAJOR;
-		disk->fops = &carm_bd_ops;
-		disk->private_data = port;
+	q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops,
+				 max_queue, BLK_MQ_F_SHOULD_MERGE);
+	if (IS_ERR(q))
+		return PTR_ERR(q);
+	disk->queue = q;
+	blk_queue_max_segments(q, CARM_MAX_REQ_SG);
+	blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
 
-		q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops,
-					 max_queue, BLK_MQ_F_SHOULD_MERGE);
-		if (IS_ERR(q)) {
-			rc = PTR_ERR(q);
-			break;
-		}
-		disk->queue = q;
-		blk_queue_max_segments(q, CARM_MAX_REQ_SG);
-		blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
-
-		q->queuedata = port;
-	}
-
-	return rc;
+	q->queuedata = port;
+	return 0;
 }
 
-static void carm_free_disks(struct carm_host *host)
+static void carm_free_disk(struct carm_host *host, unsigned int port_no)
 {
-	unsigned int i;
+	struct carm_port *port = &host->port[port_no];
+	struct gendisk *disk = port->disk;
 
-	for (i = 0; i < CARM_MAX_PORTS; i++) {
-		struct carm_port *port = &host->port[i];
-		struct gendisk *disk = port->disk;
+	if (!disk)
+		return;
 
-		if (disk) {
-			struct request_queue *q = disk->queue;
-
-			if (disk->flags & GENHD_FL_UP)
-				del_gendisk(disk);
-			if (q) {
-				blk_mq_free_tag_set(&port->tag_set);
-				blk_cleanup_queue(q);
-			}
-			put_disk(disk);
-		}
+	if (disk->flags & GENHD_FL_UP)
+		del_gendisk(disk);
+	if (disk->queue) {
+		blk_mq_free_tag_set(&port->tag_set);
+		blk_cleanup_queue(disk->queue);
 	}
+	put_disk(disk);
 }
 
 static int carm_init_shm(struct carm_host *host)
@@ -1667,9 +1651,11 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (host->flags & FL_DYN_MAJOR)
 		host->major = rc;
 
-	rc = carm_init_disks(host);
-	if (rc)
-		goto err_out_blkdev_disks;
+	for (i = 0; i < CARM_MAX_PORTS; i++) {
+		rc = carm_init_disk(host, i);
+		if (rc)
+			goto err_out_blkdev_disks;
+	}
 
 	pci_set_master(pdev);
 
@@ -1699,7 +1685,8 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 err_out_free_irq:
 	free_irq(pdev->irq, host);
 err_out_blkdev_disks:
-	carm_free_disks(host);
+	for (i = 0; i < CARM_MAX_PORTS; i++)
+		carm_free_disk(host, i);
 	unregister_blkdev(host->major, host->name);
 err_out_free_majors:
 	if (host->major == 160)
@@ -1724,6 +1711,7 @@ err_out:
 static void carm_remove_one (struct pci_dev *pdev)
 {
 	struct carm_host *host = pci_get_drvdata(pdev);
+	unsigned int i;
 
 	if (!host) {
 		printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n",
@@ -1732,7 +1720,8 @@ static void carm_remove_one (struct pci_dev *pdev)
 	}
 
 	free_irq(pdev->irq, host);
-	carm_free_disks(host);
+	for (i = 0; i < CARM_MAX_PORTS; i++)
+		carm_free_disk(host, i);
 	unregister_blkdev(host->major, host->name);
 	if (host->major == 160)
 		clear_bit(0, &carm_major_alloc);

From 72d7ce8eb2bca8f77eebdd3271e59f6a5998dc4a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:51:10 +0100
Subject: [PATCH 076/351] sx8: use a per-host tag_set

The current sx8 code spends a lot of effort dealing with the fact that
tags are per-host, but there might be multiple queues.  Now that the
driver has been converted to blk-mq it can take care of the blk-mq
tag_set concept that has been designed just for that.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/sx8.c | 343 ++++++++++++--------------------------------
 1 file changed, 95 insertions(+), 248 deletions(-)

diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 1f371b9c0954..4478eb7efee0 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -243,7 +243,6 @@ struct carm_port {
 	unsigned int			port_no;
 	struct gendisk			*disk;
 	struct carm_host		*host;
-	struct blk_mq_tag_set		tag_set;
 
 	/* attached device characteristics */
 	u64				capacity;
@@ -254,13 +253,10 @@ struct carm_port {
 };
 
 struct carm_request {
-	unsigned int			tag;
 	int				n_elem;
 	unsigned int			msg_type;
 	unsigned int			msg_subtype;
 	unsigned int			msg_bucket;
-	struct request			*rq;
-	struct carm_port		*port;
 	struct scatterlist		sg[CARM_MAX_REQ_SG];
 };
 
@@ -291,9 +287,6 @@ struct carm_host {
 	unsigned int			wait_q_cons;
 	struct request_queue		*wait_q[CARM_MAX_WAIT_Q];
 
-	unsigned int			n_msgs;
-	u64				msg_alloc;
-	struct carm_request		req[CARM_MAX_REQ];
 	void				*msg_base;
 	dma_addr_t			msg_dma;
 
@@ -478,10 +471,10 @@ static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host,
 }
 
 static int carm_send_msg(struct carm_host *host,
-			 struct carm_request *crq)
+			 struct carm_request *crq, unsigned tag)
 {
 	void __iomem *mmio = host->mmio;
-	u32 msg = (u32) carm_ref_msg_dma(host, crq->tag);
+	u32 msg = (u32) carm_ref_msg_dma(host, tag);
 	u32 cm_bucket = crq->msg_bucket;
 	u32 tmp;
 	int rc = 0;
@@ -506,99 +499,24 @@ static int carm_send_msg(struct carm_host *host,
 	return rc;
 }
 
-static struct carm_request *carm_get_request(struct carm_host *host)
-{
-	unsigned int i;
-
-	/* obey global hardware limit on S/G entries */
-	if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG))
-		return NULL;
-
-	for (i = 0; i < max_queue; i++)
-		if ((host->msg_alloc & (1ULL << i)) == 0) {
-			struct carm_request *crq = &host->req[i];
-			crq->port = NULL;
-			crq->n_elem = 0;
-
-			host->msg_alloc |= (1ULL << i);
-			host->n_msgs++;
-
-			assert(host->n_msgs <= CARM_MAX_REQ);
-			sg_init_table(crq->sg, CARM_MAX_REQ_SG);
-			return crq;
-		}
-
-	DPRINTK("no request available, returning NULL\n");
-	return NULL;
-}
-
-static int carm_put_request(struct carm_host *host, struct carm_request *crq)
-{
-	assert(crq->tag < max_queue);
-
-	if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0))
-		return -EINVAL; /* tried to clear a tag that was not active */
-
-	assert(host->hw_sg_used >= crq->n_elem);
-
-	host->msg_alloc &= ~(1ULL << crq->tag);
-	host->hw_sg_used -= crq->n_elem;
-	host->n_msgs--;
-
-	return 0;
-}
-
-static struct carm_request *carm_get_special(struct carm_host *host)
-{
-	unsigned long flags;
-	struct carm_request *crq = NULL;
-	struct request *rq;
-	int tries = 5000;
-
-	while (tries-- > 0) {
-		spin_lock_irqsave(&host->lock, flags);
-		crq = carm_get_request(host);
-		spin_unlock_irqrestore(&host->lock, flags);
-
-		if (crq)
-			break;
-		msleep(10);
-	}
-
-	if (!crq)
-		return NULL;
-
-	rq = blk_get_request(host->oob_q, REQ_OP_DRV_OUT, 0);
-	if (IS_ERR(rq)) {
-		spin_lock_irqsave(&host->lock, flags);
-		carm_put_request(host, crq);
-		spin_unlock_irqrestore(&host->lock, flags);
-		return NULL;
-	}
-
-	crq->rq = rq;
-	return crq;
-}
-
 static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 {
 	struct carm_msg_ioctl *ioc;
-	unsigned int idx;
 	u32 msg_data;
 	dma_addr_t msg_dma;
 	struct carm_request *crq;
+	struct request *rq;
 	int rc;
 
-	crq = carm_get_special(host);
-	if (!crq) {
+	rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
+	if (IS_ERR(rq)) {
 		rc = -ENOMEM;
 		goto err_out;
 	}
+	crq = blk_mq_rq_to_pdu(rq);
 
-	idx = crq->tag;
-
-	ioc = carm_ref_msg(host, idx);
-	msg_dma = carm_ref_msg_dma(host, idx);
+	ioc = carm_ref_msg(host, rq->tag);
+	msg_dma = carm_ref_msg_dma(host, rq->tag);
 	msg_data = (u32) (msg_dma + sizeof(struct carm_array_info));
 
 	crq->msg_type = CARM_MSG_ARRAY;
@@ -612,7 +530,7 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	ioc->type	= CARM_MSG_ARRAY;
 	ioc->subtype	= CARM_ARRAY_INFO;
 	ioc->array_id	= (u8) array_idx;
-	ioc->handle	= cpu_to_le32(TAG_ENCODE(idx));
+	ioc->handle	= cpu_to_le32(TAG_ENCODE(rq->tag));
 	ioc->data_addr	= cpu_to_le32(msg_data);
 
 	spin_lock_irq(&host->lock);
@@ -620,9 +538,8 @@ static int carm_array_info (struct carm_host *host, unsigned int array_idx)
 	       host->state == HST_DEV_SCAN);
 	spin_unlock_irq(&host->lock);
 
-	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->special = crq;
-	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
+	blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
 
 	return 0;
 
@@ -637,21 +554,21 @@ typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *);
 
 static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 {
+	struct request *rq;
 	struct carm_request *crq;
 	struct carm_msg_ioctl *ioc;
 	void *mem;
-	unsigned int idx, msg_size;
+	unsigned int msg_size;
 	int rc;
 
-	crq = carm_get_special(host);
-	if (!crq)
+	rq = blk_mq_alloc_request(host->oob_q, REQ_OP_DRV_OUT, 0);
+	if (IS_ERR(rq))
 		return -ENOMEM;
+	crq = blk_mq_rq_to_pdu(rq);
 
-	idx = crq->tag;
+	mem = carm_ref_msg(host, rq->tag);
 
-	mem = carm_ref_msg(host, idx);
-
-	msg_size = func(host, idx, mem);
+	msg_size = func(host, rq->tag, mem);
 
 	ioc = mem;
 	crq->msg_type = ioc->type;
@@ -660,9 +577,8 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func)
 	BUG_ON(rc < 0);
 	crq->msg_bucket = (u32) rc;
 
-	DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
-	crq->rq->special = crq;
-	blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
+	DPRINTK("blk_execute_rq_nowait, tag == %u\n", rq->tag);
+	blk_execute_rq_nowait(host->oob_q, NULL, rq, true, NULL);
 
 	return 0;
 }
@@ -744,19 +660,6 @@ static unsigned int carm_fill_get_fw_ver(struct carm_host *host,
 	       sizeof(struct carm_fw_ver);
 }
 
-static inline void carm_end_request_queued(struct carm_host *host,
-					   struct carm_request *crq,
-					   blk_status_t error)
-{
-	struct request *req = crq->rq;
-	int rc;
-
-	blk_mq_end_request(req, error);
-
-	rc = carm_put_request(host, crq);
-	assert(rc == 0);
-}
-
 static inline void carm_push_q (struct carm_host *host, struct request_queue *q)
 {
 	unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q;
@@ -791,101 +694,50 @@ static inline void carm_round_robin(struct carm_host *host)
 	}
 }
 
-static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq,
-			       blk_status_t error)
+static inline enum dma_data_direction carm_rq_dir(struct request *rq)
 {
-	carm_end_request_queued(host, crq, error);
-	if (max_queue == 1)
-		carm_round_robin(host);
-	else if ((host->n_msgs <= CARM_MSG_LOW_WATER) &&
-		 (host->hw_sg_used <= CARM_SG_LOW_WATER)) {
-		carm_round_robin(host);
-	}
-}
-
-static blk_status_t carm_oob_queue_rq(struct blk_mq_hw_ctx *hctx,
-				      const struct blk_mq_queue_data *bd)
-{
-	struct request_queue *q = hctx->queue;
-	struct carm_host *host = q->queuedata;
-	struct carm_request *crq;
-	int rc;
-
-	blk_mq_start_request(bd->rq);
-
-	spin_lock_irq(&host->lock);
-
-	crq = bd->rq->special;
-	assert(crq != NULL);
-	assert(crq->rq == bd->rq);
-
-	crq->n_elem = 0;
-
-	DPRINTK("send req\n");
-	rc = carm_send_msg(host, crq);
-	if (rc) {
-		carm_push_q(host, q);
-		spin_unlock_irq(&host->lock);
-		return BLK_STS_DEV_RESOURCE;
-	}
-
-	spin_unlock_irq(&host->lock);
-	return BLK_STS_OK;
+	return op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
 }
 
 static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
 				  const struct blk_mq_queue_data *bd)
 {
 	struct request_queue *q = hctx->queue;
+	struct request *rq = bd->rq;
 	struct carm_port *port = q->queuedata;
 	struct carm_host *host = port->host;
+	struct carm_request *crq = blk_mq_rq_to_pdu(rq);
 	struct carm_msg_rw *msg;
-	struct carm_request *crq;
-	struct request *rq = bd->rq;
 	struct scatterlist *sg;
-	int writing = 0, pci_dir, i, n_elem, rc;
-	u32 tmp;
+	int i, n_elem = 0, rc;
 	unsigned int msg_size;
+	u32 tmp;
+
+	crq->n_elem = 0;
+	sg_init_table(crq->sg, CARM_MAX_REQ_SG);
 
 	blk_mq_start_request(rq);
 
 	spin_lock_irq(&host->lock);
-
-	crq = carm_get_request(host);
-	if (!crq) {
-		carm_push_q(host, q);
-		spin_unlock_irq(&host->lock);
-		return BLK_STS_DEV_RESOURCE;
-	}
-	crq->rq = rq;
-
-	if (rq_data_dir(rq) == WRITE) {
-		writing = 1;
-		pci_dir = DMA_TO_DEVICE;
-	} else {
-		pci_dir = DMA_FROM_DEVICE;
-	}
+	if (req_op(rq) == REQ_OP_DRV_OUT)
+		goto send_msg;
 
 	/* get scatterlist from block layer */
 	sg = &crq->sg[0];
 	n_elem = blk_rq_map_sg(q, rq, sg);
-	if (n_elem <= 0) {
-		/* request with no s/g entries? */
-		carm_end_rq(host, crq, BLK_STS_IOERR);
-		spin_unlock_irq(&host->lock);
-		return BLK_STS_IOERR;
-	}
+	if (n_elem <= 0)
+		goto out_ioerr;
 
 	/* map scatterlist to PCI bus addresses */
-	n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, pci_dir);
-	if (n_elem <= 0) {
-		/* request with no s/g entries? */
-		carm_end_rq(host, crq, BLK_STS_IOERR);
-		spin_unlock_irq(&host->lock);
-		return BLK_STS_IOERR;
-	}
+	n_elem = dma_map_sg(&host->pdev->dev, sg, n_elem, carm_rq_dir(rq));
+	if (n_elem <= 0)
+		goto out_ioerr;
+
+	/* obey global hardware limit on S/G entries */
+	if (host->hw_sg_used >= CARM_MAX_HOST_SG - n_elem)
+		goto out_resource;
+
 	crq->n_elem = n_elem;
-	crq->port = port;
 	host->hw_sg_used += n_elem;
 
 	/*
@@ -893,9 +745,9 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 */
 
 	VPRINTK("build msg\n");
-	msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag);
+	msg = (struct carm_msg_rw *) carm_ref_msg(host, rq->tag);
 
-	if (writing) {
+	if (rq_data_dir(rq) == WRITE) {
 		msg->type = CARM_MSG_WRITE;
 		crq->msg_type = CARM_MSG_WRITE;
 	} else {
@@ -906,7 +758,7 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
 	msg->id		= port->port_no;
 	msg->sg_count	= n_elem;
 	msg->sg_type	= SGT_32BIT;
-	msg->handle	= cpu_to_le32(TAG_ENCODE(crq->tag));
+	msg->handle	= cpu_to_le32(TAG_ENCODE(rq->tag));
 	msg->lba	= cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
 	tmp		= (blk_rq_pos(rq) >> 16) >> 16;
 	msg->lba_high	= cpu_to_le16( (u16) tmp );
@@ -923,22 +775,28 @@ static blk_status_t carm_queue_rq(struct blk_mq_hw_ctx *hctx,
 	rc = carm_lookup_bucket(msg_size);
 	BUG_ON(rc < 0);
 	crq->msg_bucket = (u32) rc;
-
+send_msg:
 	/*
 	 * queue read/write message to hardware
 	 */
-
-	VPRINTK("send msg, tag == %u\n", crq->tag);
-	rc = carm_send_msg(host, crq);
+	VPRINTK("send msg, tag == %u\n", rq->tag);
+	rc = carm_send_msg(host, crq, rq->tag);
 	if (rc) {
-		carm_put_request(host, crq);
-		carm_push_q(host, q);
-		spin_unlock_irq(&host->lock);
-		return BLK_STS_DEV_RESOURCE;
+		host->hw_sg_used -= n_elem;
+		goto out_resource;
 	}
 
 	spin_unlock_irq(&host->lock);
 	return BLK_STS_OK;
+out_resource:
+	dma_unmap_sg(&host->pdev->dev, &crq->sg[0], n_elem, carm_rq_dir(rq));
+	carm_push_q(host, q);
+	spin_unlock_irq(&host->lock);
+	return BLK_STS_DEV_RESOURCE;
+out_ioerr:
+	carm_round_robin(host);
+	spin_unlock_irq(&host->lock);
+	return BLK_STS_IOERR;
 }
 
 static void carm_handle_array_info(struct carm_host *host,
@@ -954,8 +812,6 @@ static void carm_handle_array_info(struct carm_host *host,
 
 	DPRINTK("ENTER\n");
 
-	carm_end_rq(host, crq, error);
-
 	if (error)
 		goto out;
 	if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST)
@@ -1011,8 +867,6 @@ static void carm_handle_scan_chan(struct carm_host *host,
 
 	DPRINTK("ENTER\n");
 
-	carm_end_rq(host, crq, error);
-
 	if (error) {
 		new_state = HST_ERROR;
 		goto out;
@@ -1040,8 +894,6 @@ static void carm_handle_generic(struct carm_host *host,
 {
 	DPRINTK("ENTER\n");
 
-	carm_end_rq(host, crq, error);
-
 	assert(host->state == cur_state);
 	if (error)
 		host->state = HST_ERROR;
@@ -1050,28 +902,12 @@ static void carm_handle_generic(struct carm_host *host,
 	schedule_work(&host->fsm_task);
 }
 
-static inline void carm_handle_rw(struct carm_host *host,
-				  struct carm_request *crq, blk_status_t error)
-{
-	int pci_dir;
-
-	VPRINTK("ENTER\n");
-
-	if (rq_data_dir(crq->rq) == WRITE)
-		pci_dir = DMA_TO_DEVICE;
-	else
-		pci_dir = DMA_FROM_DEVICE;
-
-	dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem, pci_dir);
-
-	carm_end_rq(host, crq, error);
-}
-
 static inline void carm_handle_resp(struct carm_host *host,
 				    __le32 ret_handle_le, u32 status)
 {
 	u32 handle = le32_to_cpu(ret_handle_le);
 	unsigned int msg_idx;
+	struct request *rq;
 	struct carm_request *crq;
 	blk_status_t error = (status == RMSG_OK) ? 0 : BLK_STS_IOERR;
 	u8 *mem;
@@ -1087,13 +923,15 @@ static inline void carm_handle_resp(struct carm_host *host,
 	msg_idx = TAG_DECODE(handle);
 	VPRINTK("tag == %u\n", msg_idx);
 
-	crq = &host->req[msg_idx];
+	rq = blk_mq_tag_to_rq(host->tag_set.tags[0], msg_idx);
+	crq = blk_mq_rq_to_pdu(rq);
 
 	/* fast path */
 	if (likely(crq->msg_type == CARM_MSG_READ ||
 		   crq->msg_type == CARM_MSG_WRITE)) {
-		carm_handle_rw(host, crq, error);
-		return;
+		dma_unmap_sg(&host->pdev->dev, &crq->sg[0], crq->n_elem,
+			     carm_rq_dir(rq));
+		goto done;
 	}
 
 	mem = carm_ref_msg(host, msg_idx);
@@ -1103,7 +941,7 @@ static inline void carm_handle_resp(struct carm_host *host,
 		switch (crq->msg_subtype) {
 		case CARM_IOC_SCAN_CHAN:
 			carm_handle_scan_chan(host, crq, mem, error);
-			break;
+			goto done;
 		default:
 			/* unknown / invalid response */
 			goto err_out;
@@ -1116,11 +954,11 @@ static inline void carm_handle_resp(struct carm_host *host,
 		case MISC_ALLOC_MEM:
 			carm_handle_generic(host, crq, error,
 					    HST_ALLOC_BUF, HST_SYNC_TIME);
-			break;
+			goto done;
 		case MISC_SET_TIME:
 			carm_handle_generic(host, crq, error,
 					    HST_SYNC_TIME, HST_GET_FW_VER);
-			break;
+			goto done;
 		case MISC_GET_FW_VER: {
 			struct carm_fw_ver *ver = (struct carm_fw_ver *)
 				(mem + sizeof(struct carm_msg_get_fw_ver));
@@ -1130,7 +968,7 @@ static inline void carm_handle_resp(struct carm_host *host,
 			}
 			carm_handle_generic(host, crq, error,
 					    HST_GET_FW_VER, HST_PORT_SCAN);
-			break;
+			goto done;
 		}
 		default:
 			/* unknown / invalid response */
@@ -1161,7 +999,13 @@ static inline void carm_handle_resp(struct carm_host *host,
 err_out:
 	printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n",
 	       pci_name(host->pdev), crq->msg_type, crq->msg_subtype);
-	carm_end_rq(host, crq, BLK_STS_IOERR);
+	error = BLK_STS_IOERR;
+done:
+	host->hw_sg_used -= crq->n_elem;
+	blk_mq_end_request(blk_mq_rq_from_pdu(crq), error);
+
+	if (host->hw_sg_used <= CARM_SG_LOW_WATER)
+		carm_round_robin(host);
 }
 
 static inline void carm_handle_responses(struct carm_host *host)
@@ -1491,10 +1335,6 @@ static int carm_init_host(struct carm_host *host)
 	return 0;
 }
 
-static const struct blk_mq_ops carm_oob_mq_ops = {
-	.queue_rq	= carm_oob_queue_rq,
-};
-
 static const struct blk_mq_ops carm_mq_ops = {
 	.queue_rq	= carm_queue_rq,
 };
@@ -1520,15 +1360,15 @@ static int carm_init_disk(struct carm_host *host, unsigned int port_no)
 	disk->fops = &carm_bd_ops;
 	disk->private_data = port;
 
-	q = blk_mq_init_sq_queue(&port->tag_set, &carm_mq_ops,
-				 max_queue, BLK_MQ_F_SHOULD_MERGE);
+	q = blk_mq_init_queue(&host->tag_set);
 	if (IS_ERR(q))
 		return PTR_ERR(q);
-	disk->queue = q;
+
 	blk_queue_max_segments(q, CARM_MAX_REQ_SG);
 	blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
 
 	q->queuedata = port;
+	disk->queue = q;
 	return 0;
 }
 
@@ -1542,10 +1382,8 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no)
 
 	if (disk->flags & GENHD_FL_UP)
 		del_gendisk(disk);
-	if (disk->queue) {
-		blk_mq_free_tag_set(&port->tag_set);
+	if (disk->queue)
 		blk_cleanup_queue(disk->queue);
-	}
 	put_disk(disk);
 }
 
@@ -1602,9 +1440,6 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	INIT_WORK(&host->fsm_task, carm_fsm_task);
 	init_completion(&host->probe_comp);
 
-	for (i = 0; i < ARRAY_SIZE(host->req); i++)
-		host->req[i].tag = i;
-
 	host->mmio = ioremap(pci_resource_start(pdev, 0),
 			     pci_resource_len(pdev, 0));
 	if (!host->mmio) {
@@ -1621,14 +1456,26 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 		goto err_out_iounmap;
 	}
 
-	q = blk_mq_init_sq_queue(&host->tag_set, &carm_oob_mq_ops, 1,
-					BLK_MQ_F_NO_SCHED);
+	memset(&host->tag_set, 0, sizeof(host->tag_set));
+	host->tag_set.ops = &carm_mq_ops;
+	host->tag_set.cmd_size = sizeof(struct carm_request);
+	host->tag_set.nr_hw_queues = 1;
+	host->tag_set.nr_maps = 1;
+	host->tag_set.queue_depth = max_queue;
+	host->tag_set.numa_node = NUMA_NO_NODE;
+	host->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+
+	rc = blk_mq_alloc_tag_set(&host->tag_set);
+	if (rc)
+		goto err_out_dma_free;
+
+	q = blk_mq_init_queue(&host->tag_set);
 	if (IS_ERR(q)) {
-		printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n",
-		       pci_name(pdev));
 		rc = PTR_ERR(q);
+		blk_mq_free_tag_set(&host->tag_set);
 		goto err_out_dma_free;
 	}
+
 	host->oob_q = q;
 	q->queuedata = host;
 

From b5fa0e9ec997031083a76972d2ee0e8db671420a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:48:54 +0100
Subject: [PATCH 077/351] mtip32xx: move the blk_rq_map_sg call to
 mtip_hw_submit_io

We have all arguments at hand in mtip_hw_submit_io, so keep the
rq to sg mapping close to the dma_map_sg call.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 947aa10107a6..1964af8a187b 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2171,7 +2171,6 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
  * @dd       Pointer to the driver data structure.
  * @start    First sector to read.
  * @nsect    Number of sectors to read.
- * @nents    Number of entries in scatter list for the read command.
  * @tag      The tag of this read command.
  * @callback Pointer to the function that should be called
  *	     when the read completes.
@@ -2183,7 +2182,7 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
  *	None
  */
 static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
-			      struct mtip_cmd *command, int nents,
+			      struct mtip_cmd *command,
 			      struct blk_mq_hw_ctx *hctx)
 {
 	struct host_to_dev_fis	*fis;
@@ -2191,8 +2190,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
 	int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 	u64 start = blk_rq_pos(rq);
 	unsigned int nsect = blk_rq_sectors(rq);
+	unsigned int nents;
 
 	/* Map the scatter list for DMA access */
+	nents = blk_rq_map_sg(hctx->queue, rq, command->sg);
 	nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
 
 	prefetch(&port->flags);
@@ -3548,7 +3549,6 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct driver_data *dd = hctx->queue->queuedata;
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
-	unsigned int nents;
 
 	if (is_se_active(dd))
 		return -ENODATA;
@@ -3579,11 +3579,8 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
 		return 0;
 	}
 
-	/* Create the scatter list for this request. */
-	nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);
-
 	/* Issue the read/write. */
-	mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
+	mtip_hw_submit_io(dd, rq, cmd, hctx);
 	return 0;
 }
 

From 10966fa13855c5c64444bb01a97b86158e63267c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:48:55 +0100
Subject: [PATCH 078/351] mtip32xx: merge mtip_submit_request into
 mtip_queue_rq

Factor out a new is_stopped helper that matches the existing
is_se_active helper, and merge the trivial amount of remaining code
into the only caller.  This also allows better error handling by
returning a BLK_STS_* directly instead of explicitly calling
blk_mq_end_request, and moving blk_mq_start_request closer to the
actual issue to hardware.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 78 +++++++++++--------------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 1964af8a187b..75a4ca0210f2 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3534,54 +3534,24 @@ static inline bool is_se_active(struct driver_data *dd)
 	return false;
 }
 
-/*
- * Block layer make request function.
- *
- * This function is called by the kernel to process a BIO for
- * the P320 device.
- *
- * @queue Pointer to the request queue. Unused other than to obtain
- *              the driver data structure.
- * @rq    Pointer to the request.
- *
- */
-static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static inline bool is_stopped(struct driver_data *dd, struct request *rq)
 {
-	struct driver_data *dd = hctx->queue->queuedata;
-	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	if (likely(!(dd->dd_flag & MTIP_DDF_STOP_IO)))
+		return false;
 
-	if (is_se_active(dd))
-		return -ENODATA;
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+		return true;
+	if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+		return true;
+	if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) &&
+	    rq_data_dir(rq))
+		return true;
+	if (test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
+		return true;
+	if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+		return true;
 
-	if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
-		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
-							&dd->dd_flag))) {
-			return -ENXIO;
-		}
-		if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
-			return -ENODATA;
-		}
-		if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
-							&dd->dd_flag) &&
-				rq_data_dir(rq))) {
-			return -ENODATA;
-		}
-		if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag) ||
-			test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)))
-			return -ENODATA;
-	}
-
-	if (req_op(rq) == REQ_OP_DISCARD) {
-		int err;
-
-		err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
-		blk_mq_end_request(rq, err ? BLK_STS_IOERR : BLK_STS_OK);
-		return 0;
-	}
-
-	/* Issue the read/write. */
-	mtip_hw_submit_io(dd, rq, cmd, hctx);
-	return 0;
+	return false;
 }
 
 static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
@@ -3647,8 +3617,9 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
 static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
+	struct driver_data *dd = hctx->queue->queuedata;
 	struct request *rq = bd->rq;
-	int ret;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
 	mtip_init_cmd_header(rq);
 
@@ -3658,12 +3629,19 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(mtip_check_unal_depth(hctx, rq)))
 		return BLK_STS_RESOURCE;
 
+	if (is_se_active(dd) || is_stopped(dd, rq))
+		return BLK_STS_IOERR;
+
 	blk_mq_start_request(rq);
 
-	ret = mtip_submit_request(hctx, rq);
-	if (likely(!ret))
-		return BLK_STS_OK;
-	return BLK_STS_IOERR;
+	if (req_op(rq) == REQ_OP_DISCARD) {
+		if (mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)) < 0)
+			return BLK_STS_IOERR;
+	} else {
+		mtip_hw_submit_io(dd, rq, cmd, hctx);
+	}
+
+	return BLK_STS_OK;
 }
 
 static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,

From 81e66174ab0aad0bfb3d69abb334b0402fb25df4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:48:56 +0100
Subject: [PATCH 079/351] mtip32xx: return a blk_status_t from mtip_send_trim

This allows for better error propagation and simpler code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 75a4ca0210f2..d5277bba0c02 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1423,23 +1423,19 @@ static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
  * @dd		pointer to driver_data structure
  * @lba		starting lba
  * @len		# of 512b sectors to trim
- *
- * return value
- *      -ENOMEM		Out of dma memory
- *      -EINVAL		Invalid parameters passed in, trim not supported
- *      -EIO		Error submitting trim request to hw
  */
-static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
-				unsigned int len)
+static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
+		unsigned int len)
 {
-	int i, rv = 0;
 	u64 tlba, tlen, sect_left;
 	struct mtip_trim_entry *buf;
 	dma_addr_t dma_addr;
 	struct host_to_dev_fis fis;
+	blk_status_t ret = BLK_STS_OK;
+	int i;
 
 	if (!len || dd->trim_supp == false)
-		return -EINVAL;
+		return BLK_STS_IOERR;
 
 	/* Trim request too big */
 	WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
@@ -1454,7 +1450,7 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
 	buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
 								GFP_KERNEL);
 	if (!buf)
-		return -ENOMEM;
+		return BLK_STS_RESOURCE;
 	memset(buf, 0, ATA_SECT_SIZE);
 
 	for (i = 0, sect_left = len, tlba = lba;
@@ -1486,10 +1482,10 @@ static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
 					ATA_SECT_SIZE,
 					0,
 					MTIP_TRIM_TIMEOUT_MS) < 0)
-		rv = -EIO;
+		ret = BLK_STS_IOERR;
 
 	dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
-	return rv;
+	return ret;
 }
 
 /*
@@ -3634,13 +3630,9 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	blk_mq_start_request(rq);
 
-	if (req_op(rq) == REQ_OP_DISCARD) {
-		if (mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)) < 0)
-			return BLK_STS_IOERR;
-	} else {
-		mtip_hw_submit_io(dd, rq, cmd, hctx);
-	}
-
+	if (req_op(rq) == REQ_OP_DISCARD)
+		return mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
+	mtip_hw_submit_io(dd, rq, cmd, hctx);
 	return BLK_STS_OK;
 }
 

From 449a15d9e49a566a38481e51a21360c7d91a9c77 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:48:57 +0100
Subject: [PATCH 080/351] mtip32xx: remove __force_bit2int

There is no good excuse not to use proper __le16/32 types.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 33 ++++++++++++-------------------
 drivers/block/mtip32xx/mtip32xx.h | 28 ++++++++++++--------------
 2 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index d5277bba0c02..9245c325fe3c 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -181,9 +181,9 @@ static void mtip_init_cmd_header(struct request *rq)
 				(sizeof(struct mtip_cmd_hdr) * rq->tag);
 
 	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
-		cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+		cmd->command_header->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16);
 
-	cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+	cmd->command_header->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
 }
 
 static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
@@ -1459,8 +1459,8 @@ static blk_status_t mtip_send_trim(struct driver_data *dd, unsigned int lba,
 		tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
 					MTIP_MAX_TRIM_ENTRY_LEN :
 					sect_left);
-		buf[i].lba = __force_bit2int cpu_to_le32(tlba);
-		buf[i].range = __force_bit2int cpu_to_le16(tlen);
+		buf[i].lba = cpu_to_le32(tlba);
+		buf[i].range = cpu_to_le16(tlen);
 		tlba += tlen;
 		sect_left -= tlen;
 	}
@@ -1590,11 +1590,9 @@ static inline void fill_command_sg(struct driver_data *dd,
 		if (dma_len > 0x400000)
 			dev_err(&dd->pdev->dev,
 				"DMA segment length truncated\n");
-		command_sg->info = __force_bit2int
-			cpu_to_le32((dma_len-1) & 0x3FFFFF);
-		command_sg->dba	= __force_bit2int
-			cpu_to_le32(sg_dma_address(sg));
-		command_sg->dba_upper = __force_bit2int
+		command_sg->info = cpu_to_le32((dma_len-1) & 0x3FFFFF);
+		command_sg->dba	=  cpu_to_le32(sg_dma_address(sg));
+		command_sg->dba_upper =
 			cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
 		command_sg++;
 		sg++;
@@ -2231,8 +2229,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
 
 	/* Populate the command header */
 	command->command_header->opts =
-			__force_bit2int cpu_to_le32(
-				(nents << 16) | 5 | AHCI_CMD_PREFETCH);
+			cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH);
 	command->command_header->byte_count = 0;
 
 	command->direction = dma_dir;
@@ -3586,20 +3583,16 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
 		return BLK_STS_RESOURCE;
 
 	/* Populate the SG list */
-	cmd->command_header->opts =
-		 __force_bit2int cpu_to_le32(icmd->opts | icmd->fis_len);
+	cmd->command_header->opts = cpu_to_le32(icmd->opts | icmd->fis_len);
 	if (icmd->buf_len) {
 		command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ;
 
-		command_sg->info =
-			__force_bit2int cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF);
-		command_sg->dba	=
-			__force_bit2int cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
+		command_sg->info = cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF);
+		command_sg->dba	= cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
 		command_sg->dba_upper =
-			__force_bit2int cpu_to_le32((icmd->buffer >> 16) >> 16);
+			cpu_to_le32((icmd->buffer >> 16) >> 16);
 
-		cmd->command_header->opts |=
-			__force_bit2int cpu_to_le32((1 << 16));
+		cmd->command_header->opts |= cpu_to_le32((1 << 16));
 	}
 
 	/* Populate the command header */
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index e20e55dab443..0aa1ea210822 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -126,8 +126,6 @@
 
 #define MTIP_DFS_MAX_BUF_SIZE 1024
 
-#define __force_bit2int (unsigned int __force)
-
 enum {
 	/* below are bit numbers in 'flags' defined in mtip_port */
 	MTIP_PF_IC_ACTIVE_BIT       = 0, /* pio/ioctl */
@@ -200,9 +198,9 @@ struct mtip_work {
 #define MTIP_MAX_TRIM_ENTRY_LEN		0xfff8
 
 struct mtip_trim_entry {
-	u32 lba;   /* starting lba of region */
-	u16 rsvd;  /* unused */
-	u16 range; /* # of 512b blocks to trim */
+	__le32 lba;   /* starting lba of region */
+	__le16 rsvd;  /* unused */
+	__le16 range; /* # of 512b blocks to trim */
 } __packed;
 
 struct mtip_trim {
@@ -278,24 +276,24 @@ struct mtip_cmd_hdr {
 	 * - Bit 5 Unused in this implementation.
 	 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
 	 */
-	unsigned int opts;
+	__le32 opts;
 	/* This field is unsed when using NCQ. */
 	union {
-		unsigned int byte_count;
-		unsigned int status;
+		__le32 byte_count;
+		__le32 status;
 	};
 	/*
 	 * Lower 32 bits of the command table address associated with this
 	 * header. The command table addresses must be 128 byte aligned.
 	 */
-	unsigned int ctba;
+	__le32 ctba;
 	/*
 	 * If 64 bit addressing is used this field is the upper 32 bits
 	 * of the command table address associated with this command.
 	 */
-	unsigned int ctbau;
+	__le32 ctbau;
 	/* Reserved and unused. */
-	unsigned int res[4];
+	u32 res[4];
 };
 
 /* Command scatter gather structure (PRD). */
@@ -305,21 +303,21 @@ struct mtip_cmd_sg {
 	 * address must be 8 byte aligned signified by bits 2:0 being
 	 * set to 0.
 	 */
-	unsigned int dba;
+	__le32 dba;
 	/*
 	 * When 64 bit addressing is used this field is the upper
 	 * 32 bits of the data buffer address.
 	 */
-	unsigned int dba_upper;
+	__le32 dba_upper;
 	/* Unused. */
-	unsigned int reserved;
+	__le32 reserved;
 	/*
 	 * Bit 31: interrupt when this data block has been transferred.
 	 * Bits 30..22: reserved
 	 * Bits 21..0: byte count (minus 1).  For P320 the byte count must be
 	 * 8 byte aligned signified by bits 2:0 being set to 1.
 	 */
-	unsigned int info;
+	__le32 info;
 };
 struct mtip_port;
 

From 643b5f68d0f9b5a269ebbcc9f0e6c658b41b864e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:48:58 +0100
Subject: [PATCH 081/351] mtip32xx: add missing endianess annotations on struct
 smart_attr

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 0aa1ea210822..e8b4b3d5365a 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -172,10 +172,10 @@ enum {
 
 struct smart_attr {
 	u8 attr_id;
-	u16 flags;
+	__le16 flags;
 	u8 cur;
 	u8 worst;
-	u32 data;
+	__le32 data;
 	u8 res[3];
 } __packed;
 

From 7bbf118f3b15682f79eee52c3eb96a13a71372c0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:48:59 +0100
Subject: [PATCH 082/351] mtip32xx: remove mtip_init_cmd_header

There isn't much need for this helper - we can just calculate the offset
for the command header once late in the submission path and fill out
the ctba and ctbau fields there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 44 +++++++++++--------------------
 drivers/block/mtip32xx/mtip32xx.h |  5 ----
 2 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 9245c325fe3c..6006f7baa1eb 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -168,24 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
 	return false; /* device present */
 }
 
-/* we have to use runtime tag to setup command header */
-static void mtip_init_cmd_header(struct request *rq)
-{
-	struct driver_data *dd = rq->q->queuedata;
-	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
-
-	/* Point the command headers at the command tables. */
-	cmd->command_header = dd->port->command_list +
-				(sizeof(struct mtip_cmd_hdr) * rq->tag);
-	cmd->command_header_dma = dd->port->command_list_dma +
-				(sizeof(struct mtip_cmd_hdr) * rq->tag);
-
-	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
-		cmd->command_header->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16);
-
-	cmd->command_header->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
-}
-
 static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
 {
 	struct request *rq;
@@ -197,9 +179,6 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
 	if (IS_ERR(rq))
 		return NULL;
 
-	/* Internal cmd isn't submitted via .queue_rq */
-	mtip_init_cmd_header(rq);
-
 	return blk_mq_rq_to_pdu(rq);
 }
 
@@ -2179,6 +2158,8 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
 			      struct mtip_cmd *command,
 			      struct blk_mq_hw_ctx *hctx)
 {
+	struct mtip_cmd_hdr *hdr =
+		dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
 	struct host_to_dev_fis	*fis;
 	struct mtip_port *port = dd->port;
 	int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
@@ -2228,9 +2209,11 @@ static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
 		fis->device |= 1 << 7;
 
 	/* Populate the command header */
-	command->command_header->opts =
-			cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH);
-	command->command_header->byte_count = 0;
+	hdr->ctba = cpu_to_le32(command->command_dma & 0xFFFFFFFF);
+	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
+		hdr->ctbau = cpu_to_le32((command->command_dma >> 16) >> 16);
+	hdr->opts = cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH);
+	hdr->byte_count = 0;
 
 	command->direction = dma_dir;
 
@@ -3577,13 +3560,18 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
 	struct driver_data *dd = hctx->queue->queuedata;
 	struct mtip_int_cmd *icmd = rq->special;
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct mtip_cmd_hdr *hdr =
+		dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
 	struct mtip_cmd_sg *command_sg;
 
 	if (mtip_commands_active(dd->port))
 		return BLK_STS_RESOURCE;
 
+	hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
+		hdr->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16);
 	/* Populate the SG list */
-	cmd->command_header->opts = cpu_to_le32(icmd->opts | icmd->fis_len);
+	hdr->opts = cpu_to_le32(icmd->opts | icmd->fis_len);
 	if (icmd->buf_len) {
 		command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ;
 
@@ -3592,11 +3580,11 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
 		command_sg->dba_upper =
 			cpu_to_le32((icmd->buffer >> 16) >> 16);
 
-		cmd->command_header->opts |= cpu_to_le32((1 << 16));
+		hdr->opts |= cpu_to_le32((1 << 16));
 	}
 
 	/* Populate the command header */
-	cmd->command_header->byte_count = 0;
+	hdr->byte_count = 0;
 
 	blk_mq_start_request(rq);
 	mtip_issue_non_ncq_command(dd->port, rq->tag);
@@ -3610,8 +3598,6 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request *rq = bd->rq;
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
-	mtip_init_cmd_header(rq);
-
 	if (blk_rq_is_passthrough(rq))
 		return mtip_issue_reserved_cmd(hctx, rq);
 
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index e8b4b3d5365a..63414928f07c 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -323,11 +323,6 @@ struct mtip_port;
 
 /* Structure used to describe a command. */
 struct mtip_cmd {
-
-	struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
-
-	dma_addr_t command_header_dma; /* corresponding physical address */
-
 	void *command; /* ptr to command table entry */
 
 	dma_addr_t command_dma; /* corresponding physical address */

From 55c7bc37e05b5f7593b76d1c74e254b996b73d1a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:49:00 +0100
Subject: [PATCH 083/351] mtip32xx: remove mtip_get_int_command

Merging this function into the only callers makes the code flow easier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 6006f7baa1eb..5ae86224b2f5 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -168,20 +168,6 @@ static bool mtip_check_surprise_removal(struct pci_dev *pdev)
 	return false; /* device present */
 }
 
-static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
-{
-	struct request *rq;
-
-	if (mtip_check_surprise_removal(dd->pdev))
-		return NULL;
-
-	rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
-	if (IS_ERR(rq))
-		return NULL;
-
-	return blk_mq_rq_to_pdu(rq);
-}
-
 static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
 					  unsigned int tag)
 {
@@ -1002,12 +988,15 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 		return -EFAULT;
 	}
 
-	int_cmd = mtip_get_int_command(dd);
-	if (!int_cmd) {
+	if (mtip_check_surprise_removal(dd->pdev))
+		return -EFAULT;
+
+	rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
+	if (IS_ERR(rq)) {
 		dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n");
 		return -EFAULT;
 	}
-	rq = blk_mq_rq_from_pdu(int_cmd);
+
 	rq->special = &icmd;
 
 	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
@@ -1029,6 +1018,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 	}
 
 	/* Copy the command to the command table */
+	int_cmd = blk_mq_rq_to_pdu(rq);
 	memcpy(int_cmd->command, fis, fis_len*4);
 
 	rq->timeout = timeout;

From d85cb20453bc98da47ab4393a1a05afcafb39a0f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:49:01 +0100
Subject: [PATCH 084/351] mtip32xx: don't use req->special

Instead create add to the icmd into struct mtip_cmd which can be unioned
with the scatterlist used for the normal I/O path.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 5 ++---
 drivers/block/mtip32xx/mtip32xx.h | 7 ++++++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 5ae86224b2f5..e99db2c9118f 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -997,8 +997,6 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 		return -EFAULT;
 	}
 
-	rq->special = &icmd;
-
 	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
 
 	if (fis->command == ATA_CMD_SEC_ERASE_PREP)
@@ -1019,6 +1017,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 
 	/* Copy the command to the command table */
 	int_cmd = blk_mq_rq_to_pdu(rq);
+	int_cmd->icmd = &icmd;
 	memcpy(int_cmd->command, fis, fis_len*4);
 
 	rq->timeout = timeout;
@@ -3548,8 +3547,8 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
 		struct request *rq)
 {
 	struct driver_data *dd = hctx->queue->queuedata;
-	struct mtip_int_cmd *icmd = rq->special;
 	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct mtip_int_cmd *icmd = cmd->icmd;
 	struct mtip_cmd_hdr *hdr =
 		dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
 	struct mtip_cmd_sg *command_sg;
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index 63414928f07c..c33f8c3d9fb4 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -321,6 +321,8 @@ struct mtip_cmd_sg {
 };
 struct mtip_port;
 
+struct mtip_int_cmd;
+
 /* Structure used to describe a command. */
 struct mtip_cmd {
 	void *command; /* ptr to command table entry */
@@ -331,7 +333,10 @@ struct mtip_cmd {
 
 	int unaligned; /* command is unaligned on 4k boundary */
 
-	struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+	union {
+		struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+		struct mtip_int_cmd *icmd;
+	};
 
 	int retries; /* The number of retries left for this command. */
 

From 27d420bc475e68c85d567d96caf215999d76fd16 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:49:02 +0100
Subject: [PATCH 085/351] mtip32xxx: use for_each_sg

Use the proper helper instead of manually iterating the scatterlist,
which is broken in the presence of chained S/G lists.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index e99db2c9118f..a4c44db097e0 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -1549,11 +1549,11 @@ static inline void fill_command_sg(struct driver_data *dd,
 	int n;
 	unsigned int dma_len;
 	struct mtip_cmd_sg *command_sg;
-	struct scatterlist *sg = command->sg;
+	struct scatterlist *sg;
 
 	command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
 
-	for (n = 0; n < nents; n++) {
+	for_each_sg(command->sg, sg, nents, n) {
 		dma_len = sg_dma_len(sg);
 		if (dma_len > 0x400000)
 			dev_err(&dd->pdev->dev,
@@ -1563,7 +1563,6 @@ static inline void fill_command_sg(struct driver_data *dd,
 		command_sg->dba_upper =
 			cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
 		command_sg++;
-		sg++;
 	}
 }
 

From 9d037ad707ed6069fbea4e38e6ee37e027b13f1d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 19:37:44 +0100
Subject: [PATCH 086/351] block: remove req->timeout_list

Unused now that the legacy request path is gone.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  1 -
 block/blk-mq.c         |  1 -
 block/blk-timeout.c    | 12 ------------
 block/blk.h            |  2 --
 include/linux/blkdev.h |  2 --
 5 files changed, 18 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3daab9df24e0..fdc0ad2686c4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -144,7 +144,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	memset(rq, 0, sizeof(*rq));
 
 	INIT_LIST_HEAD(&rq->queuelist);
-	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->q = q;
 	rq->__sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4880e13e2394..411be60d0cb6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -327,7 +327,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->extra_len = 0;
 	rq->__deadline = 0;
 
-	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->timeout = 0;
 
 	rq->end_io = NULL;
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 6428d458072a..006cff4390c0 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -68,16 +68,6 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
 
 #endif /* CONFIG_FAIL_IO_TIMEOUT */
 
-/*
- * blk_delete_timer - Delete/cancel timer for a given function.
- * @req:	request that we are canceling timer for
- *
- */
-void blk_delete_timer(struct request *req)
-{
-	list_del_init(&req->timeout_list);
-}
-
 /**
  * blk_abort_request -- Request request recovery for the specified command
  * @req:	pointer to the request of interest
@@ -123,8 +113,6 @@ void blk_add_timer(struct request *req)
 	struct request_queue *q = req->q;
 	unsigned long expiry;
 
-	BUG_ON(!list_empty(&req->timeout_list));
-
 	/*
 	 * Some LLDs, like scsi, peek at the timeout to prevent a
 	 * command from being retried forever.
diff --git a/block/blk.h b/block/blk.h
index 78ae94886acf..41b64e6e101b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -222,8 +222,6 @@ static inline bool bio_integrity_endio(struct bio *bio)
 
 unsigned long blk_rq_timeout(unsigned long timeout);
 void blk_add_timer(struct request *req);
-void blk_delete_timer(struct request *);
-
 
 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 			     struct bio *bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9b1f470cc784..dc2a6f625ecb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -228,8 +228,6 @@ struct request {
 	/* access through blk_rq_set_deadline, blk_rq_deadline */
 	unsigned long __deadline;
 
-	struct list_head timeout_list;
-
 	union {
 		struct __call_single_data csd;
 		u64 fifo_time;

From 535ac5d3fe63b9ea1dda379f606f9d0d377d7184 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:35 +0100
Subject: [PATCH 087/351] ide: cleanup ->prep_rq calling convention

The return value is just used as a binary yes/no decision, so switch
it to a bool instead of the old BLKPREP_* values returned as an int.

Also clean up a few related comments.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-cd.c   | 22 +++++++++++-----------
 drivers/ide/ide-disk.c |  8 ++++----
 drivers/ide/ide-io.c   |  4 ++--
 include/linux/ide.h    |  2 +-
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 4ecaf2ace4cb..69c1aede5f93 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -527,8 +527,8 @@ static bool ide_cd_error_cmd(ide_drive_t *drive, struct ide_cmd *cmd)
 	return false;
 }
 
-/* standard prep_rq_fn that builds 10 byte cmds */
-static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
+/* standard prep_rq that builds 10 byte cmds */
+static bool ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 {
 	int hard_sect = queue_logical_block_size(q);
 	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
@@ -554,14 +554,14 @@ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 	req->cmd[7] = (blocks >> 8) & 0xff;
 	req->cmd[8] = blocks & 0xff;
 	req->cmd_len = 10;
-	return BLKPREP_OK;
+	return true;
 }
 
 /*
  * Most of the SCSI commands are supported directly by ATAPI devices.
  * This transform handles the few exceptions.
  */
-static int ide_cdrom_prep_pc(struct request *rq)
+static bool ide_cdrom_prep_pc(struct request *rq)
 {
 	u8 *c = scsi_req(rq)->cmd;
 
@@ -575,7 +575,7 @@ static int ide_cdrom_prep_pc(struct request *rq)
 		c[1] &= 0xe0;
 		c[0] += (READ_10 - READ_6);
 		scsi_req(rq)->cmd_len = 10;
-		return BLKPREP_OK;
+		return true;
 	}
 
 	/*
@@ -585,13 +585,13 @@ static int ide_cdrom_prep_pc(struct request *rq)
 	 */
 	if (c[0] == MODE_SENSE || c[0] == MODE_SELECT) {
 		scsi_req(rq)->result = ILLEGAL_REQUEST;
-		return BLKPREP_KILL;
+		return false;
 	}
 
-	return BLKPREP_OK;
+	return true;
 }
 
-static int ide_cdrom_prep_fn(ide_drive_t *drive, struct request *rq)
+static bool ide_cdrom_prep_rq(ide_drive_t *drive, struct request *rq)
 {
 	if (!blk_rq_is_passthrough(rq)) {
 		scsi_req_init(scsi_req(rq));
@@ -600,7 +600,7 @@ static int ide_cdrom_prep_fn(ide_drive_t *drive, struct request *rq)
 	} else if (blk_rq_is_scsi(rq))
 		return ide_cdrom_prep_pc(rq);
 
-	return 0;
+	return true;
 }
 
 static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
@@ -818,7 +818,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 		 * We may be retrying this request after an error.  Fix up any
 		 * weirdness which might be present in the request packet.
 		 */
-		ide_cdrom_prep_fn(drive, rq);
+		ide_cdrom_prep_rq(drive, rq);
 	}
 
 	/* fs requests *must* be hardware frame aligned */
@@ -1521,7 +1521,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 
 	ide_debug_log(IDE_DBG_PROBE, "enter");
 
-	drive->prep_rq = ide_cdrom_prep_fn;
+	drive->prep_rq = ide_cdrom_prep_rq;
 	blk_queue_dma_alignment(q, 31);
 	blk_queue_update_dma_pad(q, 15);
 
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index f8567c8c9dd1..724db9af0d82 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -427,12 +427,12 @@ static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
 		drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
 }
 
-static int idedisk_prep_fn(ide_drive_t *drive, struct request *rq)
+static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct ide_cmd *cmd;
 
 	if (req_op(rq) != REQ_OP_FLUSH)
-		return BLKPREP_OK;
+		return true;
 
 	if (rq->special) {
 		cmd = rq->special;
@@ -458,7 +458,7 @@ static int idedisk_prep_fn(ide_drive_t *drive, struct request *rq)
 	rq->special = cmd;
 	cmd->rq = rq;
 
-	return BLKPREP_OK;
+	return true;
 }
 
 ide_devset_get(multcount, mult_count);
@@ -547,7 +547,7 @@ static void update_flush(ide_drive_t *drive)
 
 		if (barrier) {
 			wc = true;
-			drive->prep_rq = idedisk_prep_fn;
+			drive->prep_rq = idedisk_prep_rq;
 		}
 	}
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 5093c605c91c..64e72640acf8 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -326,7 +326,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		goto kill_rq;
 	}
 
-	if (drive->prep_rq && drive->prep_rq(drive, rq))
+	if (drive->prep_rq && !drive->prep_rq(drive, rq))
 		return ide_stopped;
 
 	if (ata_pm_request(rq))
@@ -508,7 +508,7 @@ repeat:
 
 		/*
 		 * we know that the queue isn't empty, but this can happen
-		 * if the q->prep_rq_fn() decides to kill a request
+		 * if ->prep_rq() decides to kill a request
 		 */
 		if (!rq) {
 			rq = bd->rq;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 079f8bc0b0f4..272704ff21ee 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -529,7 +529,7 @@ struct ide_drive_s {
 
 	struct request_queue	*queue;	/* request queue */
 
-	int (*prep_rq)(struct ide_drive_s *, struct request *);
+	bool (*prep_rq)(struct ide_drive_s *, struct request *);
 
 	struct blk_mq_tag_set	tag_set;
 

From c092d4ec53c9c7e690ad517b414078db4da6870b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:36 +0100
Subject: [PATCH 088/351] scsi: simplify scsi_prep_state_check

Return a blk_status_t directly, and make the code a little more compact
by handling the fast path in the caller.

Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c | 102 +++++++++++++++++++---------------------
 1 file changed, 48 insertions(+), 54 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ed81b8e74cfe..5ecabb3e77b7 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1240,60 +1240,48 @@ static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req)
 		return scsi_setup_fs_cmnd(sdev, req);
 }
 
-static int
+static blk_status_t
 scsi_prep_state_check(struct scsi_device *sdev, struct request *req)
 {
-	int ret = BLKPREP_OK;
-
-	/*
-	 * If the device is not in running state we will reject some
-	 * or all commands.
-	 */
-	if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
-		switch (sdev->sdev_state) {
-		case SDEV_OFFLINE:
-		case SDEV_TRANSPORT_OFFLINE:
-			/*
-			 * If the device is offline we refuse to process any
-			 * commands.  The device must be brought online
-			 * before trying any recovery commands.
-			 */
-			sdev_printk(KERN_ERR, sdev,
-				    "rejecting I/O to offline device\n");
-			ret = BLKPREP_KILL;
-			break;
-		case SDEV_DEL:
-			/*
-			 * If the device is fully deleted, we refuse to
-			 * process any commands as well.
-			 */
-			sdev_printk(KERN_ERR, sdev,
-				    "rejecting I/O to dead device\n");
-			ret = BLKPREP_KILL;
-			break;
-		case SDEV_BLOCK:
-		case SDEV_CREATED_BLOCK:
-			ret = BLKPREP_DEFER;
-			break;
-		case SDEV_QUIESCE:
-			/*
-			 * If the devices is blocked we defer normal commands.
-			 */
-			if (req && !(req->rq_flags & RQF_PREEMPT))
-				ret = BLKPREP_DEFER;
-			break;
-		default:
-			/*
-			 * For any other not fully online state we only allow
-			 * special commands.  In particular any user initiated
-			 * command is not allowed.
-			 */
-			if (req && !(req->rq_flags & RQF_PREEMPT))
-				ret = BLKPREP_KILL;
-			break;
-		}
+	switch (sdev->sdev_state) {
+	case SDEV_OFFLINE:
+	case SDEV_TRANSPORT_OFFLINE:
+		/*
+		 * If the device is offline we refuse to process any
+		 * commands.  The device must be brought online
+		 * before trying any recovery commands.
+		 */
+		sdev_printk(KERN_ERR, sdev,
+			    "rejecting I/O to offline device\n");
+		return BLK_STS_IOERR;
+	case SDEV_DEL:
+		/*
+		 * If the device is fully deleted, we refuse to
+		 * process any commands as well.
+		 */
+		sdev_printk(KERN_ERR, sdev,
+			    "rejecting I/O to dead device\n");
+		return BLK_STS_IOERR;
+	case SDEV_BLOCK:
+	case SDEV_CREATED_BLOCK:
+		return BLK_STS_RESOURCE;
+	case SDEV_QUIESCE:
+		/*
+		 * If the devices is blocked we defer normal commands.
+		 */
+		if (req && !(req->rq_flags & RQF_PREEMPT))
+			return BLK_STS_RESOURCE;
+		return BLK_STS_OK;
+	default:
+		/*
+		 * For any other not fully online state we only allow
+		 * special commands.  In particular any user initiated
+		 * command is not allowed.
+		 */
+		if (req && !(req->rq_flags & RQF_PREEMPT))
+			return BLK_STS_IOERR;
+		return BLK_STS_OK;
 	}
-	return ret;
 }
 
 /*
@@ -1700,9 +1688,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	blk_status_t ret;
 	int reason;
 
-	ret = prep_to_mq(scsi_prep_state_check(sdev, req));
-	if (ret != BLK_STS_OK)
-		goto out_put_budget;
+	/*
+	 * If the device is not in running state we will reject some or all
+	 * commands.
+	 */
+	if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
+		ret = scsi_prep_state_check(sdev, req);
+		if (ret != BLK_STS_OK)
+			goto out_put_budget;
+	}
 
 	ret = BLK_STS_RESOURCE;
 	if (!scsi_target_queue_ready(shost, sdev))

From 785ba83b4f3e4fde236f03205dd1cd98fd6a5255 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:37 +0100
Subject: [PATCH 089/351] scsi: push blk_status_t up into
 scsi_setup_{fs,scsi}_cmnd

This just moves the prep_to_mq calls up in preparation of further removal
of BLKPREP_* usage.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c | 45 ++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5ecabb3e77b7..e665c25da144 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1177,7 +1177,20 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	scsi_add_cmd_to_list(cmd);
 }
 
-static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
+static inline blk_status_t prep_to_mq(int ret)
+{
+	switch (ret) {
+	case BLKPREP_OK:
+		return BLK_STS_OK;
+	case BLKPREP_DEFER:
+		return BLK_STS_RESOURCE;
+	default:
+		return BLK_STS_IOERR;
+	}
+}
+
+static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
+		struct request *req)
 {
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 
@@ -1190,7 +1203,7 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
 	if (req->bio) {
 		int ret = scsi_init_io(cmd);
 		if (unlikely(ret))
-			return ret;
+			return prep_to_mq(ret);
 	} else {
 		BUG_ON(blk_rq_bytes(req));
 
@@ -1201,29 +1214,31 @@ static int scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req)
 	cmd->cmnd = scsi_req(req)->cmd;
 	cmd->transfersize = blk_rq_bytes(req);
 	cmd->allowed = scsi_req(req)->retries;
-	return BLKPREP_OK;
+	return BLK_STS_OK;
 }
 
 /*
  * Setup a normal block command.  These are simple request from filesystems
  * that still need to be translated to SCSI CDBs from the ULD.
  */
-static int scsi_setup_fs_cmnd(struct scsi_device *sdev, struct request *req)
+static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev,
+		struct request *req)
 {
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 
 	if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
 		int ret = sdev->handler->prep_fn(sdev, req);
 		if (ret != BLKPREP_OK)
-			return ret;
+			return prep_to_mq(ret);
 	}
 
 	cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
 	memset(cmd->cmnd, 0, BLK_MAX_CDB);
-	return scsi_cmd_to_driver(cmd)->init_command(cmd);
+	return prep_to_mq(scsi_cmd_to_driver(cmd)->init_command(cmd));
 }
 
-static int scsi_setup_cmnd(struct scsi_device *sdev, struct request *req)
+static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev,
+		struct request *req)
 {
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 
@@ -1581,18 +1596,6 @@ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
 	return 0;
 }
 
-static inline blk_status_t prep_to_mq(int ret)
-{
-	switch (ret) {
-	case BLKPREP_OK:
-		return BLK_STS_OK;
-	case BLKPREP_DEFER:
-		return BLK_STS_RESOURCE;
-	default:
-		return BLK_STS_IOERR;
-	}
-}
-
 /* Size in bytes of the sg-list stored in the scsi-mq command-private data. */
 static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost)
 {
@@ -1600,7 +1603,7 @@ static unsigned int scsi_mq_sgl_size(struct Scsi_Host *shost)
 		sizeof(struct scatterlist);
 }
 
-static int scsi_mq_prep_fn(struct request *req)
+static blk_status_t scsi_mq_prep_fn(struct request *req)
 {
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 	struct scsi_device *sdev = req->q->queuedata;
@@ -1705,7 +1708,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 		goto out_dec_target_busy;
 
 	if (!(req->rq_flags & RQF_DONTPREP)) {
-		ret = prep_to_mq(scsi_mq_prep_fn(req));
+		ret = scsi_mq_prep_fn(req);
 		if (ret != BLK_STS_OK)
 			goto out_dec_host_busy;
 		req->rq_flags |= RQF_DONTPREP;

From 14784565f740e862adae4b1d7c91f51b4038c4f5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:38 +0100
Subject: [PATCH 090/351] scsi: clean up error handling in scsi_init_io

There is no need to call scsi_mq_free_sgtables until we have actually
allocated sgtables.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e665c25da144..1f84e2cec57b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1044,31 +1044,30 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 	int error = BLKPREP_KILL;
 
 	if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
-		goto err_exit;
+		return BLKPREP_KILL;
 
 	error = scsi_init_sgtable(rq, &cmd->sdb);
 	if (error)
-		goto err_exit;
+		return error;
 
 	if (blk_bidi_rq(rq)) {
 		error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
 		if (error)
-			goto err_exit;
+			goto out_free_sgtables;
 	}
 
 	if (blk_integrity_rq(rq)) {
 		struct scsi_data_buffer *prot_sdb = cmd->prot_sdb;
 		int ivecs, count;
 
-		if (prot_sdb == NULL) {
+		if (WARN_ON_ONCE(!prot_sdb)) {
 			/*
 			 * This can happen if someone (e.g. multipath)
 			 * queues a command to a device on an adapter
 			 * that does not support DIX.
 			 */
-			WARN_ON_ONCE(1);
 			error = BLKPREP_KILL;
-			goto err_exit;
+			goto out_free_sgtables;
 		}
 
 		ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio);
@@ -1076,7 +1075,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 		if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
 				prot_sdb->table.sgl)) {
 			error = BLKPREP_DEFER;
-			goto err_exit;
+			goto out_free_sgtables;
 		}
 
 		count = blk_rq_map_integrity_sg(rq->q, rq->bio,
@@ -1089,7 +1088,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 	}
 
 	return BLKPREP_OK;
-err_exit:
+out_free_sgtables:
 	scsi_mq_free_sgtables(cmd);
 	return error;
 }

From 159b2cbf59f44f2a0c005c1f323f8f05fb0a19f8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:39 +0100
Subject: [PATCH 091/351] scsi: return blk_status_t from scsi_init_io and
 ->init_command

Replace the old BLKPREP_* values with the BLK_STS_ ones that they are
converted to later anyway.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_lib.c    | 45 ++++++++++----------
 drivers/scsi/sd.c          | 85 +++++++++++++++++---------------------
 drivers/scsi/sd.h          |  6 +--
 drivers/scsi/sd_zbc.c      | 10 ++---
 drivers/scsi/sr.c          | 12 +++---
 include/scsi/scsi_cmnd.h   |  2 +-
 include/scsi/scsi_driver.h |  3 +-
 7 files changed, 78 insertions(+), 85 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 1f84e2cec57b..3e3bdeee8f14 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1005,7 +1005,8 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 		scsi_io_completion_action(cmd, result);
 }
 
-static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
+static blk_status_t scsi_init_sgtable(struct request *req,
+		struct scsi_data_buffer *sdb)
 {
 	int count;
 
@@ -1014,7 +1015,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	 */
 	if (unlikely(sg_alloc_table_chained(&sdb->table,
 			blk_rq_nr_phys_segments(req), sdb->table.sgl)))
-		return BLKPREP_DEFER;
+		return BLK_STS_RESOURCE;
 
 	/* 
 	 * Next, walk the list, and fill in the addresses and sizes of
@@ -1024,7 +1025,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
 	sdb->length = blk_rq_payload_bytes(req);
-	return BLKPREP_OK;
+	return BLK_STS_OK;
 }
 
 /*
@@ -1034,25 +1035,25 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb)
  *
  * Arguments:   cmd   - Command descriptor we wish to initialize
  *
- * Returns:     0 on success
- *		BLKPREP_DEFER if the failure is retryable
- *		BLKPREP_KILL if the failure is fatal
+ * Returns:     BLK_STS_OK on success
+ *		BLK_STS_RESOURCE if the failure is retryable
+ *		BLK_STS_IOERR if the failure is fatal
  */
-int scsi_init_io(struct scsi_cmnd *cmd)
+blk_status_t scsi_init_io(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
-	int error = BLKPREP_KILL;
+	blk_status_t ret;
 
 	if (WARN_ON_ONCE(!blk_rq_nr_phys_segments(rq)))
-		return BLKPREP_KILL;
+		return BLK_STS_IOERR;
 
-	error = scsi_init_sgtable(rq, &cmd->sdb);
-	if (error)
-		return error;
+	ret = scsi_init_sgtable(rq, &cmd->sdb);
+	if (ret)
+		return ret;
 
 	if (blk_bidi_rq(rq)) {
-		error = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
-		if (error)
+		ret = scsi_init_sgtable(rq->next_rq, rq->next_rq->special);
+		if (ret)
 			goto out_free_sgtables;
 	}
 
@@ -1066,7 +1067,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 			 * queues a command to a device on an adapter
 			 * that does not support DIX.
 			 */
-			error = BLKPREP_KILL;
+			ret = BLK_STS_IOERR;
 			goto out_free_sgtables;
 		}
 
@@ -1074,7 +1075,7 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 
 		if (sg_alloc_table_chained(&prot_sdb->table, ivecs,
 				prot_sdb->table.sgl)) {
-			error = BLKPREP_DEFER;
+			ret = BLK_STS_RESOURCE;
 			goto out_free_sgtables;
 		}
 
@@ -1087,10 +1088,10 @@ int scsi_init_io(struct scsi_cmnd *cmd)
 		cmd->prot_sdb->table.nents = count;
 	}
 
-	return BLKPREP_OK;
+	return BLK_STS_OK;
 out_free_sgtables:
 	scsi_mq_free_sgtables(cmd);
-	return error;
+	return ret;
 }
 EXPORT_SYMBOL(scsi_init_io);
 
@@ -1200,9 +1201,9 @@ static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
 	 * submit a request without an attached bio.
 	 */
 	if (req->bio) {
-		int ret = scsi_init_io(cmd);
-		if (unlikely(ret))
-			return prep_to_mq(ret);
+		blk_status_t ret = scsi_init_io(cmd);
+		if (unlikely(ret != BLK_STS_OK))
+			return ret;
 	} else {
 		BUG_ON(blk_rq_bytes(req));
 
@@ -1233,7 +1234,7 @@ static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev,
 
 	cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
 	memset(cmd->cmnd, 0, BLK_MAX_CDB);
-	return prep_to_mq(scsi_cmd_to_driver(cmd)->init_command(cmd));
+	return scsi_cmd_to_driver(cmd)->init_command(cmd);
 }
 
 static blk_status_t scsi_setup_cmnd(struct scsi_device *sdev,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3bb2b3351e35..4a6ed2fc8c71 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -114,7 +114,7 @@ static int sd_suspend_system(struct device *);
 static int sd_suspend_runtime(struct device *);
 static int sd_resume(struct device *);
 static void sd_rescan(struct device *);
-static int sd_init_command(struct scsi_cmnd *SCpnt);
+static blk_status_t sd_init_command(struct scsi_cmnd *SCpnt);
 static void sd_uninit_command(struct scsi_cmnd *SCpnt);
 static int sd_done(struct scsi_cmnd *);
 static void sd_eh_reset(struct scsi_cmnd *);
@@ -750,7 +750,7 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode)
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
 }
 
-static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 {
 	struct scsi_device *sdp = cmd->device;
 	struct request *rq = cmd->request;
@@ -761,7 +761,7 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 
 	rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
 	if (!rq->special_vec.bv_page)
-		return BLKPREP_DEFER;
+		return BLK_STS_RESOURCE;
 	rq->special_vec.bv_offset = 0;
 	rq->special_vec.bv_len = data_len;
 	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
@@ -784,7 +784,8 @@ static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd)
 	return scsi_init_io(cmd);
 }
 
-static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
+static blk_status_t sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd,
+		bool unmap)
 {
 	struct scsi_device *sdp = cmd->device;
 	struct request *rq = cmd->request;
@@ -794,7 +795,7 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
 
 	rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
 	if (!rq->special_vec.bv_page)
-		return BLKPREP_DEFER;
+		return BLK_STS_RESOURCE;
 	rq->special_vec.bv_offset = 0;
 	rq->special_vec.bv_len = data_len;
 	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
@@ -814,7 +815,8 @@ static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap)
 	return scsi_init_io(cmd);
 }
 
-static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
+static blk_status_t sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd,
+		bool unmap)
 {
 	struct scsi_device *sdp = cmd->device;
 	struct request *rq = cmd->request;
@@ -824,7 +826,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
 
 	rq->special_vec.bv_page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
 	if (!rq->special_vec.bv_page)
-		return BLKPREP_DEFER;
+		return BLK_STS_RESOURCE;
 	rq->special_vec.bv_offset = 0;
 	rq->special_vec.bv_len = data_len;
 	rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
@@ -844,7 +846,7 @@ static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap)
 	return scsi_init_io(cmd);
 }
 
-static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
 	struct scsi_device *sdp = cmd->device;
@@ -862,7 +864,7 @@ static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd)
 	}
 
 	if (sdp->no_write_same)
-		return BLKPREP_INVALID;
+		return BLK_STS_TARGET;
 
 	if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff)
 		return sd_setup_write_same16_cmnd(cmd, false);
@@ -939,7 +941,7 @@ out:
  * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on
  * the preference indicated by the target device.
  **/
-static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
 	struct scsi_device *sdp = cmd->device;
@@ -948,10 +950,10 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 	sector_t sector = blk_rq_pos(rq);
 	unsigned int nr_sectors = blk_rq_sectors(rq);
 	unsigned int nr_bytes = blk_rq_bytes(rq);
-	int ret;
+	blk_status_t ret;
 
 	if (sdkp->device->no_write_same)
-		return BLKPREP_INVALID;
+		return BLK_STS_TARGET;
 
 	BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size);
 
@@ -992,7 +994,7 @@ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd)
 	return ret;
 }
 
-static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
+static blk_status_t sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
 
@@ -1005,10 +1007,10 @@ static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd)
 	cmd->allowed = SD_MAX_RETRIES;
 
 	rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER;
-	return BLKPREP_OK;
+	return BLK_STS_OK;
 }
 
-static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
+static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 {
 	struct request *rq = SCpnt->request;
 	struct scsi_device *sdp = SCpnt->device;
@@ -1018,18 +1020,14 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 	sector_t threshold;
 	unsigned int this_count = blk_rq_sectors(rq);
 	unsigned int dif, dix;
-	int ret;
 	unsigned char protect;
+	blk_status_t ret;
 
 	ret = scsi_init_io(SCpnt);
-	if (ret != BLKPREP_OK)
+	if (ret != BLK_STS_OK)
 		return ret;
 	WARN_ON_ONCE(SCpnt != rq->special);
 
-	/* from here on until we're complete, any goto out
-	 * is used for a killable error condition */
-	ret = BLKPREP_KILL;
-
 	SCSI_LOG_HLQUEUE(1,
 		scmd_printk(KERN_INFO, SCpnt,
 			"%s: block=%llu, count=%d\n",
@@ -1042,7 +1040,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 						blk_rq_sectors(rq)));
 		SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
 						"Retry with 0x%p\n", SCpnt));
-		goto out;
+		return BLK_STS_IOERR;
 	}
 
 	if (sdp->changed) {
@@ -1051,7 +1049,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 		 * the changed bit has been reset
 		 */
 		/* printk("SCSI disk has been changed or is not present. Prohibiting further I/O.\n"); */
-		goto out;
+		return BLK_STS_IOERR;
 	}
 
 	/*
@@ -1089,31 +1087,28 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 		if ((block & 1) || (blk_rq_sectors(rq) & 1)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
-			goto out;
-		} else {
-			block = block >> 1;
-			this_count = this_count >> 1;
+			return BLK_STS_IOERR;
 		}
+		block = block >> 1;
+		this_count = this_count >> 1;
 	}
 	if (sdp->sector_size == 2048) {
 		if ((block & 3) || (blk_rq_sectors(rq) & 3)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
-			goto out;
-		} else {
-			block = block >> 2;
-			this_count = this_count >> 2;
+			return BLK_STS_IOERR;
 		}
+		block = block >> 2;
+		this_count = this_count >> 2;
 	}
 	if (sdp->sector_size == 4096) {
 		if ((block & 7) || (blk_rq_sectors(rq) & 7)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
-			goto out;
-		} else {
-			block = block >> 3;
-			this_count = this_count >> 3;
+			return BLK_STS_IOERR;
 		}
+		block = block >> 3;
+		this_count = this_count >> 3;
 	}
 	if (rq_data_dir(rq) == WRITE) {
 		SCpnt->cmnd[0] = WRITE_6;
@@ -1125,7 +1120,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 		SCpnt->cmnd[0] = READ_6;
 	} else {
 		scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq));
-		goto out;
+		return BLK_STS_IOERR;
 	}
 
 	SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
@@ -1145,10 +1140,8 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 	if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) {
 		SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);
 
-		if (unlikely(SCpnt->cmnd == NULL)) {
-			ret = BLKPREP_DEFER;
-			goto out;
-		}
+		if (unlikely(!SCpnt->cmnd))
+			return BLK_STS_RESOURCE;
 
 		SCpnt->cmd_len = SD_EXT_CDB_SIZE;
 		memset(SCpnt->cmnd, 0, SCpnt->cmd_len);
@@ -1216,7 +1209,7 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 			 */
 			scmd_printk(KERN_ERR, SCpnt,
 				    "FUA write on READ/WRITE(6) drive\n");
-			goto out;
+			return BLK_STS_IOERR;
 		}
 
 		SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
@@ -1240,12 +1233,10 @@ static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt)
 	 * This indicates that the command is ready from our end to be
 	 * queued.
 	 */
-	ret = BLKPREP_OK;
- out:
-	return ret;
+	return BLK_STS_OK;
 }
 
-static int sd_init_command(struct scsi_cmnd *cmd)
+static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
 
@@ -1261,7 +1252,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 		case SD_LBP_ZERO:
 			return sd_setup_write_same10_cmnd(cmd, false);
 		default:
-			return BLKPREP_INVALID;
+			return BLK_STS_TARGET;
 		}
 	case REQ_OP_WRITE_ZEROES:
 		return sd_setup_write_zeroes_cmnd(cmd);
@@ -1276,7 +1267,7 @@ static int sd_init_command(struct scsi_cmnd *cmd)
 		return sd_zbc_setup_reset_cmnd(cmd);
 	default:
 		WARN_ON_ONCE(1);
-		return BLKPREP_KILL;
+		return BLK_STS_NOTSUPP;
 	}
 }
 
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 1d63f3a23ffb..7f43e6839bce 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -271,7 +271,7 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp)
 
 extern int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer);
 extern void sd_zbc_print_zones(struct scsi_disk *sdkp);
-extern int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
+extern blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd);
 extern void sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
 			    struct scsi_sense_hdr *sshdr);
 extern int sd_zbc_report_zones(struct gendisk *disk, sector_t sector,
@@ -288,9 +288,9 @@ static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
 
 static inline void sd_zbc_print_zones(struct scsi_disk *sdkp) {}
 
-static inline int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+static inline blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 {
-	return BLKPREP_INVALID;
+	return BLK_STS_TARGET;
 }
 
 static inline void sd_zbc_complete(struct scsi_cmnd *cmd,
diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c
index e06c48c866e4..83365b29a4d8 100644
--- a/drivers/scsi/sd_zbc.c
+++ b/drivers/scsi/sd_zbc.c
@@ -185,7 +185,7 @@ static inline sector_t sd_zbc_zone_sectors(struct scsi_disk *sdkp)
  *
  * Called from sd_init_command() for a REQ_OP_ZONE_RESET request.
  */
-int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
+blk_status_t sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 {
 	struct request *rq = cmd->request;
 	struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
@@ -194,14 +194,14 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 
 	if (!sd_is_zoned(sdkp))
 		/* Not a zoned device */
-		return BLKPREP_KILL;
+		return BLK_STS_IOERR;
 
 	if (sdkp->device->changed)
-		return BLKPREP_KILL;
+		return BLK_STS_IOERR;
 
 	if (sector & (sd_zbc_zone_sectors(sdkp) - 1))
 		/* Unaligned request */
-		return BLKPREP_KILL;
+		return BLK_STS_IOERR;
 
 	cmd->cmd_len = 16;
 	memset(cmd->cmnd, 0, cmd->cmd_len);
@@ -214,7 +214,7 @@ int sd_zbc_setup_reset_cmnd(struct scsi_cmnd *cmd)
 	cmd->transfersize = 0;
 	cmd->allowed = 0;
 
-	return BLKPREP_OK;
+	return BLK_STS_OK;
 }
 
 /**
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 54dd70ae9731..38ddbbfe5f3c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -80,7 +80,7 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_WORM);
 static DEFINE_MUTEX(sr_mutex);
 static int sr_probe(struct device *);
 static int sr_remove(struct device *);
-static int sr_init_command(struct scsi_cmnd *SCpnt);
+static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt);
 static int sr_done(struct scsi_cmnd *);
 static int sr_runtime_suspend(struct device *dev);
 
@@ -384,22 +384,22 @@ static int sr_done(struct scsi_cmnd *SCpnt)
 	return good_bytes;
 }
 
-static int sr_init_command(struct scsi_cmnd *SCpnt)
+static blk_status_t sr_init_command(struct scsi_cmnd *SCpnt)
 {
 	int block = 0, this_count, s_size;
 	struct scsi_cd *cd;
 	struct request *rq = SCpnt->request;
-	int ret;
+	blk_status_t ret;
 
 	ret = scsi_init_io(SCpnt);
-	if (ret != BLKPREP_OK)
+	if (ret != BLK_STS_OK)
 		goto out;
 	WARN_ON_ONCE(SCpnt != rq->special);
 	cd = scsi_cd(rq->rq_disk);
 
 	/* from here on until we're complete, any goto out
 	 * is used for a killable error condition */
-	ret = BLKPREP_KILL;
+	ret = BLK_STS_IOERR;
 
 	SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt,
 		"Doing sr request, block = %d\n", block));
@@ -516,7 +516,7 @@ static int sr_init_command(struct scsi_cmnd *SCpnt)
 	 * This indicates that the command is ready from our end to be
 	 * queued.
 	 */
-	ret = BLKPREP_OK;
+	ret = BLK_STS_OK;
  out:
 	return ret;
 }
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index c891ada3c5c2..d6fd2aba0380 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -171,7 +171,7 @@ extern void *scsi_kmap_atomic_sg(struct scatterlist *sg, int sg_count,
 				 size_t *offset, size_t *len);
 extern void scsi_kunmap_atomic_sg(void *virt);
 
-extern int scsi_init_io(struct scsi_cmnd *cmd);
+extern blk_status_t scsi_init_io(struct scsi_cmnd *cmd);
 
 #ifdef CONFIG_SCSI_DMA
 extern int scsi_dma_map(struct scsi_cmnd *cmd);
diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h
index fae8b465233e..6dffa8555a39 100644
--- a/include/scsi/scsi_driver.h
+++ b/include/scsi/scsi_driver.h
@@ -2,6 +2,7 @@
 #ifndef _SCSI_SCSI_DRIVER_H
 #define _SCSI_SCSI_DRIVER_H
 
+#include <linux/blk_types.h>
 #include <linux/device.h>
 
 struct module;
@@ -13,7 +14,7 @@ struct scsi_driver {
 	struct device_driver	gendrv;
 
 	void (*rescan)(struct device *);
-	int (*init_command)(struct scsi_cmnd *);
+	blk_status_t (*init_command)(struct scsi_cmnd *);
 	void (*uninit_command)(struct scsi_cmnd *);
 	int (*done)(struct scsi_cmnd *);
 	int (*eh_action)(struct scsi_cmnd *, int);

From 4c1cb67c03511a4a404aaaeb206222d5bebdc4ca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:40 +0100
Subject: [PATCH 092/351] scsi: return blk_status_t from device handler
 ->prep_fn

Remove the last use of the old BLKPREP_* values, which get converted
to BLK_STS_* later anyway.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/device_handler/scsi_dh_alua.c  | 23 +++++++++++----------
 drivers/scsi/device_handler/scsi_dh_emc.c   |  8 +++----
 drivers/scsi/device_handler/scsi_dh_hp_sw.c |  7 +++----
 drivers/scsi/device_handler/scsi_dh_rdac.c  |  7 +++----
 drivers/scsi/scsi_lib.c                     | 18 +++-------------
 include/scsi/scsi_dh.h                      |  2 +-
 6 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c
index 12dc7100bb4c..d7ac498ba35a 100644
--- a/drivers/scsi/device_handler/scsi_dh_alua.c
+++ b/drivers/scsi/device_handler/scsi_dh_alua.c
@@ -1071,28 +1071,29 @@ static void alua_check(struct scsi_device *sdev, bool force)
  * Fail I/O to all paths not in state
  * active/optimized or active/non-optimized.
  */
-static int alua_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t alua_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct alua_dh_data *h = sdev->handler_data;
 	struct alua_port_group *pg;
 	unsigned char state = SCSI_ACCESS_STATE_OPTIMAL;
-	int ret = BLKPREP_OK;
 
 	rcu_read_lock();
 	pg = rcu_dereference(h->pg);
 	if (pg)
 		state = pg->state;
 	rcu_read_unlock();
-	if (state == SCSI_ACCESS_STATE_TRANSITIONING)
-		ret = BLKPREP_DEFER;
-	else if (state != SCSI_ACCESS_STATE_OPTIMAL &&
-		 state != SCSI_ACCESS_STATE_ACTIVE &&
-		 state != SCSI_ACCESS_STATE_LBA) {
-		ret = BLKPREP_KILL;
-		req->rq_flags |= RQF_QUIET;
-	}
-	return ret;
 
+	switch (state) {
+	case SCSI_ACCESS_STATE_OPTIMAL:
+	case SCSI_ACCESS_STATE_ACTIVE:
+	case SCSI_ACCESS_STATE_LBA:
+		return BLK_STS_OK;
+	case SCSI_ACCESS_STATE_TRANSITIONING:
+		return BLK_STS_RESOURCE;
+	default:
+		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
+	}
 }
 
 static void alua_rescan(struct scsi_device *sdev)
diff --git a/drivers/scsi/device_handler/scsi_dh_emc.c b/drivers/scsi/device_handler/scsi_dh_emc.c
index 95c47909a58f..bea8e13febb6 100644
--- a/drivers/scsi/device_handler/scsi_dh_emc.c
+++ b/drivers/scsi/device_handler/scsi_dh_emc.c
@@ -341,17 +341,17 @@ static int clariion_check_sense(struct scsi_device *sdev,
 	return SCSI_RETURN_NOT_HANDLED;
 }
 
-static int clariion_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t clariion_prep_fn(struct scsi_device *sdev,
+		struct request *req)
 {
 	struct clariion_dh_data *h = sdev->handler_data;
-	int ret = BLKPREP_OK;
 
 	if (h->lun_state != CLARIION_LUN_OWNED) {
-		ret = BLKPREP_KILL;
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
 
+	return BLK_STS_OK;
 }
 
 static int clariion_std_inquiry(struct scsi_device *sdev,
diff --git a/drivers/scsi/device_handler/scsi_dh_hp_sw.c b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
index e65a0ebb4b54..80129b033855 100644
--- a/drivers/scsi/device_handler/scsi_dh_hp_sw.c
+++ b/drivers/scsi/device_handler/scsi_dh_hp_sw.c
@@ -172,17 +172,16 @@ retry:
 	return rc;
 }
 
-static int hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t hp_sw_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct hp_sw_dh_data *h = sdev->handler_data;
-	int ret = BLKPREP_OK;
 
 	if (h->path_state != HP_SW_PATH_ACTIVE) {
-		ret = BLKPREP_KILL;
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
 
+	return BLK_STS_OK;
 }
 
 /*
diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c
index d27fabae8ddd..65f1fe343c64 100644
--- a/drivers/scsi/device_handler/scsi_dh_rdac.c
+++ b/drivers/scsi/device_handler/scsi_dh_rdac.c
@@ -642,17 +642,16 @@ done:
 	return 0;
 }
 
-static int rdac_prep_fn(struct scsi_device *sdev, struct request *req)
+static blk_status_t rdac_prep_fn(struct scsi_device *sdev, struct request *req)
 {
 	struct rdac_dh_data *h = sdev->handler_data;
-	int ret = BLKPREP_OK;
 
 	if (h->state != RDAC_STATE_ACTIVE) {
-		ret = BLKPREP_KILL;
 		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
 	}
-	return ret;
 
+	return BLK_STS_OK;
 }
 
 static int rdac_check_sense(struct scsi_device *sdev,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 3e3bdeee8f14..5d83a162d03b 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1177,18 +1177,6 @@ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd)
 	scsi_add_cmd_to_list(cmd);
 }
 
-static inline blk_status_t prep_to_mq(int ret)
-{
-	switch (ret) {
-	case BLKPREP_OK:
-		return BLK_STS_OK;
-	case BLKPREP_DEFER:
-		return BLK_STS_RESOURCE;
-	default:
-		return BLK_STS_IOERR;
-	}
-}
-
 static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev,
 		struct request *req)
 {
@@ -1227,9 +1215,9 @@ static blk_status_t scsi_setup_fs_cmnd(struct scsi_device *sdev,
 	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
 
 	if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
-		int ret = sdev->handler->prep_fn(sdev, req);
-		if (ret != BLKPREP_OK)
-			return prep_to_mq(ret);
+		blk_status_t ret = sdev->handler->prep_fn(sdev, req);
+		if (ret != BLK_STS_OK)
+			return ret;
 	}
 
 	cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
diff --git a/include/scsi/scsi_dh.h b/include/scsi/scsi_dh.h
index c7bba2b24849..a862dc23c68d 100644
--- a/include/scsi/scsi_dh.h
+++ b/include/scsi/scsi_dh.h
@@ -69,7 +69,7 @@ struct scsi_device_handler {
 	int (*attach)(struct scsi_device *);
 	void (*detach)(struct scsi_device *);
 	int (*activate)(struct scsi_device *, activate_complete, void *);
-	int (*prep_fn)(struct scsi_device *, struct request *);
+	blk_status_t (*prep_fn)(struct scsi_device *, struct request *);
 	int (*set_params)(struct scsi_device *, const char *);
 	void (*rescan)(struct scsi_device *);
 };

From 0e17e06cbf7ede285ab74bab44d888b40c21f828 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 9 Nov 2018 14:42:41 +0100
Subject: [PATCH 093/351] block: remove the BLKPREP_* values.

Unused now.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dc2a6f625ecb..e67ad2dd025e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -776,16 +776,6 @@ static inline unsigned int blk_queue_depth(struct request_queue *q)
 	return q->nr_requests;
 }
 
-/*
- * q->prep_rq_fn return values
- */
-enum {
-	BLKPREP_OK,		/* serve it */
-	BLKPREP_KILL,		/* fatal error, kill, return -EIO */
-	BLKPREP_DEFER,		/* leave on queue */
-	BLKPREP_INVALID,	/* invalid command, kill, return -EREMOTEIO */
-};
-
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 
 /*

From e41128cfd9389b889e91097b8ca61574d1e71482 Mon Sep 17 00:00:00 2001
From: YueHaibing <yuehaibing@huawei.com>
Date: Sat, 10 Nov 2018 02:41:14 +0000
Subject: [PATCH 094/351] block: remove set but not used variable 'et'

Fixes gcc '-Wunused-but-set-variable' warning:

block/blk-ioc.c: In function 'put_io_context_active':
block/blk-ioc.c:174:24: warning:
 variable 'et' set but not used [-Wunused-but-set-variable]

It not used any more after commit
a1ce35fa4985 ("block: remove dead elevator code")

Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-ioc.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 007aac6e6a4b..56755ad5ac88 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -171,7 +171,6 @@ EXPORT_SYMBOL(put_io_context);
  */
 void put_io_context_active(struct io_context *ioc)
 {
-	struct elevator_type *et;
 	unsigned long flags;
 	struct io_cq *icq;
 
@@ -190,7 +189,6 @@ void put_io_context_active(struct io_context *ioc)
 		if (icq->flags & ICQ_EXITED)
 			continue;
 
-		et = icq->q->elevator->type;
 		ioc_exit_icq(icq);
 	}
 	spin_unlock_irqrestore(&ioc->lock, flags);

From 511c49fe1804671800947b69281e07719fad25e2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:44 +0100
Subject: [PATCH 095/351] fnic: fix fnic_scsi_host_{start,end}_tag

The way these functions abuse ->special to try to store the dummy
request looks completely broken, given that it actually stores the
original scsi command.

Instead switch to ->host_scribble and store the actual dummy command.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/fnic/fnic_scsi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c
index 96acfcecd540..cafbcfb85bfa 100644
--- a/drivers/scsi/fnic/fnic_scsi.c
+++ b/drivers/scsi/fnic/fnic_scsi.c
@@ -2274,7 +2274,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 		return SCSI_NO_TAG;
 
 	sc->tag = sc->request->tag = dummy->tag;
-	sc->request->special = sc;
+	sc->host_scribble = (unsigned char *)dummy;
 
 	return dummy->tag;
 }
@@ -2286,7 +2286,7 @@ fnic_scsi_host_start_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 static inline void
 fnic_scsi_host_end_tag(struct fnic *fnic, struct scsi_cmnd *sc)
 {
-	struct request *dummy = sc->request->special;
+	struct request *dummy = (struct request *)sc->host_scribble;
 
 	blk_mq_free_request(dummy);
 }

From 49f6613632f9504ae59ccbbb92a5b07f474d75b2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:45 +0100
Subject: [PATCH 096/351] nullb: remove leftover legacy request code

null_softirq_done_fn is only used for the blk-mq path, so remove the
other branch.  Also rename the function to better match the method name.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk_main.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 9c045bee0985..16ba3db2a015 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -642,14 +642,11 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
 	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
 }
 
-static void null_softirq_done_fn(struct request *rq)
+static void null_complete_rq(struct request *rq)
 {
 	struct nullb *nullb = rq->q->queuedata;
 
-	if (nullb->dev->queue_mode == NULL_Q_MQ)
-		end_cmd(blk_mq_rq_to_pdu(rq));
-	else
-		end_cmd(rq->special);
+	end_cmd(blk_mq_rq_to_pdu(rq));
 }
 
 static struct nullb_page *null_alloc_page(gfp_t gfp_flags)
@@ -1357,7 +1354,7 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 static const struct blk_mq_ops null_mq_ops = {
 	.queue_rq       = null_queue_rq,
-	.complete	= null_softirq_done_fn,
+	.complete	= null_complete_rq,
 	.timeout	= null_timeout_rq,
 };
 

From 1bee42438f32de405987b4a41b93624b184ae481 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:46 +0100
Subject: [PATCH 097/351] skd_main: don't use req->special

Add a retries field to the internal request structure instead, which gets
set to zero on the first submission.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/skd_main.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index a0196477165f..a10d5736d8f7 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -181,6 +181,7 @@ struct skd_request_context {
 	struct fit_completion_entry_v1 completion;
 
 	struct fit_comp_error_info err_info;
+	int retries;
 
 	blk_status_t status;
 };
@@ -495,6 +496,11 @@ static blk_status_t skd_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (unlikely(skdev->state != SKD_DRVR_STATE_ONLINE))
 		return skd_fail_all(q) ? BLK_STS_IOERR : BLK_STS_RESOURCE;
 
+	if (!(req->rq_flags & RQF_DONTPREP)) {
+		skreq->retries = 0;
+		req->rq_flags |= RQF_DONTPREP;
+	}
+
 	blk_mq_start_request(req);
 
 	WARN_ONCE(tag >= skd_max_queue_depth, "%#x > %#x (nr_requests = %lu)\n",
@@ -1426,7 +1432,7 @@ static void skd_resolve_req_exception(struct skd_device *skdev,
 		break;
 
 	case SKD_CHECK_STATUS_REQUEUE_REQUEST:
-		if ((unsigned long) ++req->special < SKD_MAX_RETRIES) {
+		if (++skreq->retries < SKD_MAX_RETRIES) {
 			skd_log_skreq(skdev, skreq, "retry");
 			blk_mq_requeue_request(req, true);
 			break;

From 61e7712e25bbe964c9537bb1171bac4df7afa593 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:47 +0100
Subject: [PATCH 098/351] aoe: replace ->special use with private data in the
 request

Makes the code a whole lot easier to read.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/aoe/aoe.h    |  4 ++++
 drivers/block/aoe/aoeblk.c |  1 +
 drivers/block/aoe/aoecmd.c | 27 +++++++++------------------
 drivers/block/aoe/aoedev.c | 11 ++++++-----
 4 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 7ca76ed2e71a..84d0fcebd6af 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -100,6 +100,10 @@ enum {
 	MAX_TAINT = 1000,	/* cap on aoetgt taint */
 };
 
+struct aoe_req {
+	unsigned long nr_bios;
+};
+
 struct buf {
 	ulong nframesout;
 	struct bio *bio;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index ed26b7287256..e2c6aae2d636 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -387,6 +387,7 @@ aoeblk_gdalloc(void *vp)
 
 	set = &d->tag_set;
 	set->ops = &aoeblk_mq_ops;
+	set->cmd_size = sizeof(struct aoe_req);
 	set->nr_hw_queues = 1;
 	set->queue_depth = 128;
 	set->numa_node = NUMA_NO_NODE;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index bb2fba651bd2..3cf9bc5d8d95 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -822,17 +822,6 @@ out:
 	spin_unlock_irqrestore(&d->lock, flags);
 }
 
-static unsigned long
-rqbiocnt(struct request *r)
-{
-	struct bio *bio;
-	unsigned long n = 0;
-
-	__rq_for_each_bio(bio, r)
-		n++;
-	return n;
-}
-
 static void
 bufinit(struct buf *buf, struct request *rq, struct bio *bio)
 {
@@ -847,6 +836,7 @@ nextbuf(struct aoedev *d)
 {
 	struct request *rq;
 	struct request_queue *q;
+	struct aoe_req *req;
 	struct buf *buf;
 	struct bio *bio;
 
@@ -865,7 +855,11 @@ nextbuf(struct aoedev *d)
 		blk_mq_start_request(rq);
 		d->ip.rq = rq;
 		d->ip.nxbio = rq->bio;
-		rq->special = (void *) rqbiocnt(rq);
+
+		req = blk_mq_rq_to_pdu(rq);
+		req->nr_bios = 0;
+		__rq_for_each_bio(bio, rq)
+			req->nr_bios++;
 	}
 	buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
 	if (buf == NULL) {
@@ -1069,16 +1063,13 @@ aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
 static void
 aoe_end_buf(struct aoedev *d, struct buf *buf)
 {
-	struct request *rq;
-	unsigned long n;
+	struct request *rq = buf->rq;
+	struct aoe_req *req = blk_mq_rq_to_pdu(rq);
 
 	if (buf == d->ip.buf)
 		d->ip.buf = NULL;
-	rq = buf->rq;
 	mempool_free(buf, d->bufpool);
-	n = (unsigned long) rq->special;
-	rq->special = (void *) --n;
-	if (n == 0)
+	if (--req->nr_bios == 0)
 		aoe_end_request(d, rq, 0);
 }
 
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 9063f8efbd3b..5b49f1b33ebe 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -160,21 +160,22 @@ static void
 aoe_failip(struct aoedev *d)
 {
 	struct request *rq;
+	struct aoe_req *req;
 	struct bio *bio;
-	unsigned long n;
 
 	aoe_failbuf(d, d->ip.buf);
-
 	rq = d->ip.rq;
 	if (rq == NULL)
 		return;
+
+	req = blk_mq_rq_to_pdu(rq);
 	while ((bio = d->ip.nxbio)) {
 		bio->bi_status = BLK_STS_IOERR;
 		d->ip.nxbio = bio->bi_next;
-		n = (unsigned long) rq->special;
-		rq->special = (void *) --n;
+		req->nr_bios--;
 	}
-	if ((unsigned long) rq->special == 0)
+
+	if (!req->nr_bios)
 		aoe_end_request(d, rq, 0);
 }
 

From 289d088b66182076d33b5579417d429371cf9dfd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:48 +0100
Subject: [PATCH 099/351] pd: replace ->special use with private data in the
 request

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/paride/pd.c | 30 +++++++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 5 deletions(-)

diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index ae4971e5d9a8..0ff9b12d0e35 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -242,6 +242,11 @@ struct pd_unit {
 
 static struct pd_unit pd[PD_UNITS];
 
+struct pd_req {
+	/* for REQ_OP_DRV_IN: */
+	enum action (*func)(struct pd_unit *disk);
+};
+
 static char pd_scratch[512];	/* scratch block buffer */
 
 static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR",
@@ -502,8 +507,9 @@ static enum action do_pd_io_start(void)
 
 static enum action pd_special(void)
 {
-	enum action (*func)(struct pd_unit *) = pd_req->special;
-	return func(pd_current);
+	struct pd_req *req = blk_mq_rq_to_pdu(pd_req);
+
+	return req->func(pd_current);
 }
 
 static int pd_next_buf(void)
@@ -767,12 +773,14 @@ static int pd_special_command(struct pd_unit *disk,
 		      enum action (*func)(struct pd_unit *disk))
 {
 	struct request *rq;
+	struct pd_req *req;
 
 	rq = blk_get_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
 	if (IS_ERR(rq))
 		return PTR_ERR(rq);
+	req = blk_mq_rq_to_pdu(rq);
 
-	rq->special = func;
+	req->func = func;
 	blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
 	blk_put_request(rq);
 	return 0;
@@ -892,9 +900,21 @@ static void pd_probe_drive(struct pd_unit *disk)
 	disk->gd = p;
 	p->private_data = disk;
 
-	p->queue = blk_mq_init_sq_queue(&disk->tag_set, &pd_mq_ops, 2,
-				BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
+	memset(&disk->tag_set, 0, sizeof(disk->tag_set));
+	disk->tag_set.ops = &pd_mq_ops;
+	disk->tag_set.cmd_size = sizeof(struct pd_req);
+	disk->tag_set.nr_hw_queues = 1;
+	disk->tag_set.nr_maps = 1;
+	disk->tag_set.queue_depth = 2;
+	disk->tag_set.numa_node = NUMA_NO_NODE;
+	disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+
+	if (blk_mq_alloc_tag_set(&disk->tag_set))
+		return;
+
+	p->queue = blk_mq_init_queue(&disk->tag_set);
 	if (IS_ERR(p->queue)) {
+		blk_mq_free_tag_set(&disk->tag_set);
 		p->queue = NULL;
 		return;
 	}

From 22ce0a7ccf23d55d1fdaa2974002f8b5ae765665 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 10 Nov 2018 09:30:49 +0100
Subject: [PATCH 100/351] ide: don't use req->special

Just replace it with a field of the same name in struct ide_req.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-atapi.c    |  4 ++--
 drivers/ide/ide-cd.c       |  4 ++--
 drivers/ide/ide-devsets.c  |  4 ++--
 drivers/ide/ide-disk.c     |  6 +++---
 drivers/ide/ide-eh.c       |  2 +-
 drivers/ide/ide-floppy.c   |  2 +-
 drivers/ide/ide-io.c       | 14 +++++++++-----
 drivers/ide/ide-park.c     |  4 ++--
 drivers/ide/ide-pm.c       | 12 ++++++------
 drivers/ide/ide-tape.c     |  2 +-
 drivers/ide/ide-taskfile.c |  2 +-
 include/linux/ide.h        |  1 +
 12 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 33210bc67618..da58020a144e 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,7 +94,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 
 	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_MISC;
-	rq->special = (char *)pc;
+	ide_req(rq)->special = pc;
 
 	if (buf && bufflen) {
 		error = blk_rq_map_kern(drive->queue, rq, buf, bufflen,
@@ -244,7 +244,7 @@ int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 		return -ENOMEM;
 	}
 
-	sense_rq->special = special;
+	ide_req(sense_rq)->special = special;
 	drive->sense_rq_armed = false;
 
 	drive->hwif->rq = NULL;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 69c1aede5f93..1f03884a6808 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -211,12 +211,12 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For ATA_PRIV_SENSE, "rq->special" points to the original
+	 * For ATA_PRIV_SENSE, "ide_req(rq)->special" points to the original
 	 * failed request.  Also, the sense data should be read
 	 * directly from rq which might be different from the original
 	 * sense buffer if it got copied during mapping.
 	 */
-	struct request *failed = (struct request *)rq->special;
+	struct request *failed = ide_req(rq)->special;
 	void *sense = bio_data(rq->bio);
 
 	if (failed) {
diff --git a/drivers/ide/ide-devsets.c b/drivers/ide/ide-devsets.c
index f4f8afdf8bbe..f2f93ed40356 100644
--- a/drivers/ide/ide-devsets.c
+++ b/drivers/ide/ide-devsets.c
@@ -171,7 +171,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 	scsi_req(rq)->cmd_len = 5;
 	scsi_req(rq)->cmd[0] = REQ_DEVSET_EXEC;
 	*(int *)&scsi_req(rq)->cmd[1] = arg;
-	rq->special = setting->set;
+	ide_req(rq)->special = setting->set;
 
 	blk_execute_rq(q, NULL, rq, 0);
 	ret = scsi_req(rq)->result;
@@ -182,7 +182,7 @@ int ide_devset_execute(ide_drive_t *drive, const struct ide_devset *setting,
 
 ide_startstop_t ide_do_devset(ide_drive_t *drive, struct request *rq)
 {
-	int err, (*setfunc)(ide_drive_t *, int) = rq->special;
+	int err, (*setfunc)(ide_drive_t *, int) = ide_req(rq)->special;
 
 	err = setfunc(drive, *(int *)&scsi_req(rq)->cmd[1]);
 	if (err)
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 724db9af0d82..197912af5c2f 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -434,8 +434,8 @@ static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq)
 	if (req_op(rq) != REQ_OP_FLUSH)
 		return true;
 
-	if (rq->special) {
-		cmd = rq->special;
+	if (ide_req(rq)->special) {
+		cmd = ide_req(rq)->special;
 		memset(cmd, 0, sizeof(*cmd));
 	} else {
 		cmd = kzalloc(sizeof(*cmd), GFP_ATOMIC);
@@ -455,7 +455,7 @@ static bool idedisk_prep_rq(ide_drive_t *drive, struct request *rq)
 	rq->cmd_flags &= ~REQ_OP_MASK;
 	rq->cmd_flags |= REQ_OP_DRV_OUT;
 	ide_req(rq)->type = ATA_PRIV_TASKFILE;
-	rq->special = cmd;
+	ide_req(rq)->special = cmd;
 	cmd->rq = rq;
 
 	return true;
diff --git a/drivers/ide/ide-eh.c b/drivers/ide/ide-eh.c
index 47d5f3379748..e1323e058454 100644
--- a/drivers/ide/ide-eh.c
+++ b/drivers/ide/ide-eh.c
@@ -125,7 +125,7 @@ ide_startstop_t ide_error(ide_drive_t *drive, const char *msg, u8 stat)
 	/* retry only "normal" I/O: */
 	if (blk_rq_is_passthrough(rq)) {
 		if (ata_taskfile_request(rq)) {
-			struct ide_cmd *cmd = rq->special;
+			struct ide_cmd *cmd = ide_req(rq)->special;
 
 			if (cmd)
 				ide_complete_cmd(drive, cmd, stat, err);
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index a8df300f949c..780d33ccc5d8 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -276,7 +276,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		switch (ide_req(rq)->type) {
 		case ATA_PRIV_MISC:
 		case ATA_PRIV_SENSE:
-			pc = (struct ide_atapi_pc *)rq->special;
+			pc = (struct ide_atapi_pc *)ide_req(rq)->special;
 			break;
 		default:
 			BUG();
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 64e72640acf8..94e9c79c41cf 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -111,7 +111,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 	}
 
 	if (rq && ata_taskfile_request(rq)) {
-		struct ide_cmd *orig_cmd = rq->special;
+		struct ide_cmd *orig_cmd = ide_req(rq)->special;
 
 		if (cmd->tf_flags & IDE_TFLAG_DYN)
 			kfree(orig_cmd);
@@ -261,7 +261,7 @@ EXPORT_SYMBOL_GPL(ide_init_sg_cmd);
 static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 		struct request *rq)
 {
-	struct ide_cmd *cmd = rq->special;
+	struct ide_cmd *cmd = ide_req(rq)->special;
 
 	if (cmd) {
 		if (cmd->protocol == ATA_PROT_PIO) {
@@ -352,7 +352,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		if (ata_taskfile_request(rq))
 			return execute_drive_cmd(drive, rq);
 		else if (ata_pm_request(rq)) {
-			struct ide_pm_state *pm = rq->special;
+			struct ide_pm_state *pm = ide_req(rq)->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
@@ -460,16 +460,20 @@ blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
 	ide_drive_t	*drive = hctx->queue->queuedata;
 	ide_hwif_t	*hwif = drive->hwif;
 	struct ide_host *host = hwif->host;
-	struct request	*rq = NULL;
+	struct request	*rq = bd->rq;
 	ide_startstop_t	startstop;
 
+	if (!(rq->rq_flags & RQF_DONTPREP)) {
+		rq->rq_flags |= RQF_DONTPREP;
+		ide_req(rq)->special = NULL;
+	}
+
 	/* HLD do_request() callback might sleep, make sure it's okay */
 	might_sleep();
 
 	if (ide_lock_host(host, hwif))
 		return BLK_STS_DEV_RESOURCE;
 
-	rq = bd->rq;
 	blk_mq_start_request(rq);
 
 	spin_lock_irq(&hwif->lock);
diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index de9e85cf74d1..102aa3bc3e7f 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -36,7 +36,7 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 	scsi_req(rq)->cmd[0] = REQ_PARK_HEADS;
 	scsi_req(rq)->cmd_len = 1;
 	ide_req(rq)->type = ATA_PRIV_MISC;
-	rq->special = &timeout;
+	ide_req(rq)->special = &timeout;
 	blk_execute_rq(q, NULL, rq, 1);
 	rc = scsi_req(rq)->result ? -EIO : 0;
 	blk_put_request(rq);
@@ -67,7 +67,7 @@ ide_startstop_t ide_do_park_unpark(ide_drive_t *drive, struct request *rq)
 
 	memset(&cmd, 0, sizeof(cmd));
 	if (scsi_req(rq)->cmd[0] == REQ_PARK_HEADS) {
-		drive->sleep = *(unsigned long *)rq->special;
+		drive->sleep = *(unsigned long *)ide_req(rq)->special;
 		drive->dev_flags |= IDE_DFLAG_SLEEPING;
 		tf->command = ATA_CMD_IDLEIMMEDIATE;
 		tf->feature = 0x44;
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index ea10507e5190..a8c53c98252d 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -21,7 +21,7 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, 0);
 	ide_req(rq)->type = ATA_PRIV_PM_SUSPEND;
-	rq->special = &rqpm;
+	ide_req(rq)->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
 		mesg.event = PM_EVENT_FREEZE;
@@ -82,7 +82,7 @@ int generic_ide_resume(struct device *dev)
 	memset(&rqpm, 0, sizeof(rqpm));
 	rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_PREEMPT);
 	ide_req(rq)->type = ATA_PRIV_PM_RESUME;
-	rq->special = &rqpm;
+	ide_req(rq)->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
@@ -101,7 +101,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct ide_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = ide_req(rq)->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -131,7 +131,7 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct ide_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = ide_req(rq)->special;
 	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
@@ -203,7 +203,7 @@ out_do_tf:
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct ide_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = ide_req(rq)->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -228,7 +228,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct ide_pm_state *pm = rq->special;
+	struct ide_pm_state *pm = ide_req(rq)->special;
 
 	if (blk_rq_is_private(rq) &&
 	    ide_req(rq)->type == ATA_PRIV_PM_SUSPEND &&
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 34c1165226a4..db1a65f4b490 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -639,7 +639,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		goto out;
 	}
 	if (req->cmd[13] & REQ_IDETAPE_PC1) {
-		pc = (struct ide_atapi_pc *)rq->special;
+		pc = (struct ide_atapi_pc *)ide_req(rq)->special;
 		req->cmd[13] &= ~(REQ_IDETAPE_PC1);
 		req->cmd[13] |= REQ_IDETAPE_PC2;
 		goto out;
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index c21d5c50ae3a..17b2e379e872 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -440,7 +440,7 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 			goto put_req;
 	}
 
-	rq->special = cmd;
+	ide_req(rq)->special = cmd;
 	cmd->rq = rq;
 
 	blk_execute_rq(drive->queue, NULL, rq, 0);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 272704ff21ee..e7d29ae633cd 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -50,6 +50,7 @@ struct ide_request {
 	struct scsi_request sreq;
 	u8 sense[SCSI_SENSE_BUFFERSIZE];
 	u8 type;
+	void *special;
 };
 
 static inline struct ide_request *ide_req(struct request *rq)

From 8e18ebef4dd4c83cea65730821da423553ed6019 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 10 Nov 2018 13:03:52 -0700
Subject: [PATCH 101/351] null_blk: remove unused nullb device
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The compiler rightfully complains:

drivers/block/null_blk_main.c: In function ‘null_complete_rq’:
drivers/block/null_blk_main.c:647:16: warning: unused variable ‘nullb’ [-Wunused-variable]
  struct nullb *nullb = rq->q->queuedata;
                ^~~~~

Fixes: 49f6613632f9 ("nullb: remove leftover legacy request code")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/null_blk_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 16ba3db2a015..63c23fcfc4df 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -644,8 +644,6 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
 
 static void null_complete_rq(struct request *rq)
 {
-	struct nullb *nullb = rq->q->queuedata;
-
 	end_cmd(blk_mq_rq_to_pdu(rq));
 }
 

From 628bd85947091830a8c4872adfd5ed1d515a9cf2 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 12 Nov 2018 08:42:14 -0700
Subject: [PATCH 102/351] loop: Fix double mutex_unlock(&loop_ctl_mutex) in
 loop_control_ioctl()

Commit 0a42e99b58a20883 ("loop: Get rid of loop_index_mutex") forgot to
remove mutex_unlock(&loop_ctl_mutex) from loop_control_ioctl() when
replacing loop_index_mutex with loop_ctl_mutex.

Fixes: 0a42e99b58a20883 ("loop: Get rid of loop_index_mutex")
Reported-by: syzbot <syzbot+c0138741c2290fc5e63f@syzkaller.appspotmail.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index bf6bc35aaf88..176ab1f28eca 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2074,12 +2074,10 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd,
 			break;
 		if (lo->lo_state != Lo_unbound) {
 			ret = -EBUSY;
-			mutex_unlock(&loop_ctl_mutex);
 			break;
 		}
 		if (atomic_read(&lo->lo_refcnt) > 0) {
 			ret = -EBUSY;
-			mutex_unlock(&loop_ctl_mutex);
 			break;
 		}
 		lo->lo_disk->private_data = NULL;

From d16a67667c611f00b5ec0017ad2b18f473af13d2 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 12 Nov 2018 17:19:32 -0700
Subject: [PATCH 103/351] ide: don't clear special on ide_queue_rq() entry

We can't use RQF_DONTPREP to see if we should clear ->special,
as someone could have set that while inserting the request. Make
sure we clear it in our ->initialize_rq_fn() helper instead.

Fixes: 22ce0a7ccf23 ("ide: don't use req->special")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c    | 5 -----
 drivers/ide/ide-probe.c | 1 +
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 94e9c79c41cf..c0dd0fad16a3 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -463,11 +463,6 @@ blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request	*rq = bd->rq;
 	ide_startstop_t	startstop;
 
-	if (!(rq->rq_flags & RQF_DONTPREP)) {
-		rq->rq_flags |= RQF_DONTPREP;
-		ide_req(rq)->special = NULL;
-	}
-
 	/* HLD do_request() callback might sleep, make sure it's okay */
 	might_sleep();
 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 40384838e439..63627be0811a 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -746,6 +746,7 @@ static void ide_initialize_rq(struct request *rq)
 {
 	struct ide_request *req = blk_mq_rq_to_pdu(rq);
 
+	req->special = NULL;
 	scsi_req_init(&req->sreq);
 	req->sreq.sense = req->sense;
 }

From 30e066286e232772cad72c87008a958e23e40a33 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 14 Nov 2018 10:13:50 -0700
Subject: [PATCH 104/351] nvme: fix boot hang with only being able to get one
 IRQ vector

NVMe always asks for io_queues + 1 worth of IRQ vectors, which
means that even when we scale all the way down, we still ask
for 2 vectors and get -ENOSPC in return if the system can't
support more than 1.

Getting just 1 vector is fine, it just means that we'll have
1 IO queue and 1 admin queue, with a shared vector between them.
Check for this case and don't add our + 1 if it happens.

Fixes: 3b6592f70ad7 ("nvme: utilize two queue maps, one for reads and one for writes")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 6aa86dfcb32c..ffbab5b01df4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2073,7 +2073,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 		.nr_sets = ARRAY_SIZE(irq_sets),
 		.sets = irq_sets,
 	};
-	int result;
+	int result = 0;
 
 	/*
 	 * For irq sets, we have to ask for minvec == maxvec. This passes
@@ -2088,9 +2088,16 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 			affd.nr_sets = 1;
 
 		/*
-		 * Need IRQs for read+write queues, and one for the admin queue
+		 * Need IRQs for read+write queues, and one for the admin queue.
+		 * If we can't get more than one vector, we have to share the
+		 * admin queue and IO queue vector. For that case, don't add
+		 * an extra vector for the admin queue, or we'll continue
+		 * asking for 2 and get -ENOSPC in return.
 		 */
-		nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
+		if (result == -ENOSPC && nr_io_queues == 1)
+			nr_io_queues = 1;
+		else
+			nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
 
 		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
 				nr_io_queues,

From 98c98cb770da9469ee868f21299b2ba21fe438ac Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Wed, 14 Nov 2018 22:17:05 +0000
Subject: [PATCH 105/351] block: clean up dead code that is now redundant

The boolean next_sorted is set to false and is never changed, hence
the code that checks if it is true is dead code and can now be
removed.  This dead code occurred from a previous commit that cleaned
up the elevator and removed the setting of next_sorted to true.

Detected by CoverityScan, CID#1475401 ("'Constant' variable guards
dead code")

Fixes: a1ce35fa4985 ("block: remove dead elevator code")
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/elevator.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/block/elevator.c b/block/elevator.c
index 19351ffa56b1..796436270682 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -394,18 +394,11 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
-	bool next_sorted = false;
 
 	if (e->type->ops.requests_merged)
 		e->type->ops.requests_merged(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
-
-	if (next_sorted) {
-		elv_rqhash_del(q, next);
-		q->nr_sorted--;
-	}
-
 	q->last_merge = rq;
 }
 

From 7ff4f8035695984c513598e2d49c8277d5d234ca Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 14 Nov 2018 15:22:49 -0700
Subject: [PATCH 106/351] block: remove dead queue members

No more users of ->in_flight[] or ->nr_sorted, get rid of them.

Fixes: a1ce35fa4985 ("block: remove dead elevator code")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e67ad2dd025e..c961329be96b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -486,9 +486,6 @@ struct request_queue {
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
 
-	unsigned int		nr_sorted;
-	unsigned int		in_flight[2];
-
 	unsigned int		rq_timeout;
 	int			poll_nsec;
 

From e96c0d8336fdc5f1a6cae798bfa9c6e730001ad4 Mon Sep 17 00:00:00 2001
From: Eric Biggers <ebiggers@google.com>
Date: Wed, 14 Nov 2018 17:19:46 -0800
Subject: [PATCH 107/351] block: make blk_try_req_merge() static

blk_try_req_merge() is only used in block/blk-merge.c, so make it
static.

This addresses a gcc warning when -Wmissing-prototypes is enabled.

Signed-off-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 91b2af332a84..da0217f321c4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -712,7 +712,8 @@ static inline bool blk_discard_mergable(struct request *req)
 	return false;
 }
 
-enum elv_merge blk_try_req_merge(struct request *req, struct request *next)
+static enum elv_merge blk_try_req_merge(struct request *req,
+					struct request *next)
 {
 	if (blk_discard_mergable(req))
 		return ELEVATOR_DISCARD_MERGE;

From 8f4236d9008b0973a8281256ccfde6913cdec6cb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:04 +0100
Subject: [PATCH 108/351] block: remove QUEUE_FLAG_BYPASS and ->bypass

Unused since the removal of the legacy request code.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 15 ---------------
 block/blk-core.c           | 21 ---------------------
 block/blk-mq-debugfs.c     |  1 -
 block/blk-throttle.c       |  3 ---
 include/linux/blk-cgroup.h |  6 +-----
 include/linux/blkdev.h     |  3 ---
 6 files changed, 1 insertion(+), 48 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6c65791bc3fe..a95cddb39f1c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -270,13 +270,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(q->queue_lock);
 
-	/*
-	 * This could be the first entry point of blkcg implementation and
-	 * we shouldn't allow anything to go through for a bypassing queue.
-	 */
-	if (unlikely(blk_queue_bypass(q)))
-		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
-
 	blkg = __blkg_lookup(blkcg, q, true);
 	if (blkg)
 		return blkg;
@@ -741,14 +734,6 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
 
 	if (!blkcg_policy_enabled(q, pol))
 		return ERR_PTR(-EOPNOTSUPP);
-
-	/*
-	 * This could be the first entry point of blkcg implementation and
-	 * we shouldn't allow anything to go through for a bypassing queue.
-	 */
-	if (unlikely(blk_queue_bypass(q)))
-		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
-
 	return __blkg_lookup(blkcg, q, true /* update_hint */);
 }
 
diff --git a/block/blk-core.c b/block/blk-core.c
index fdc0ad2686c4..1c9b6975cf0a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -370,18 +370,6 @@ void blk_cleanup_queue(struct request_queue *q)
 	blk_set_queue_dying(q);
 	spin_lock_irq(lock);
 
-	/*
-	 * A dying queue is permanently in bypass mode till released.  Note
-	 * that, unlike blk_queue_bypass_start(), we aren't performing
-	 * synchronize_rcu() after entering bypass mode to avoid the delay
-	 * as some drivers create and destroy a lot of queues while
-	 * probing.  This is still safe because blk_release_queue() will be
-	 * called only after the queue refcnt drops to zero and nothing,
-	 * RCU or not, would be traversing the queue by then.
-	 */
-	q->bypass_depth++;
-	queue_flag_set(QUEUE_FLAG_BYPASS, q);
-
 	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 	queue_flag_set(QUEUE_FLAG_DYING, q);
@@ -589,15 +577,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 
 	q->queue_lock = lock ? : &q->__queue_lock;
 
-	/*
-	 * A queue starts its life with bypass turned on to avoid
-	 * unnecessary bypass on/off overhead and nasty surprises during
-	 * init.  The initial bypass will be finished when the queue is
-	 * registered by blk_register_queue().
-	 */
-	q->bypass_depth = 1;
-	queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q);
-
 	init_waitqueue_head(&q->mq_freeze_wq);
 
 	/*
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index f021f4817b80..a32bb79d6c95 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -114,7 +114,6 @@ static int queue_pm_only_show(void *data, struct seq_file *m)
 static const char *const blk_queue_flag_name[] = {
 	QUEUE_FLAG_NAME(STOPPED),
 	QUEUE_FLAG_NAME(DYING),
-	QUEUE_FLAG_NAME(BYPASS),
 	QUEUE_FLAG_NAME(BIDI),
 	QUEUE_FLAG_NAME(NOMERGES),
 	QUEUE_FLAG_NAME(SAME_COMP),
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index db1a3a2ae006..8e6f3c9821c2 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2145,9 +2145,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	throtl_update_latency_buckets(td);
 
-	if (unlikely(blk_queue_bypass(q)))
-		goto out_unlock;
-
 	blk_throtl_assoc_bio(tg, bio);
 	blk_throtl_update_idletime(tg);
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 1b299e025e83..2c68efc603bd 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -325,16 +325,12 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
  * @q: request_queue of interest
  *
  * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
+ * under RCU read loc.
  */
 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 					   struct request_queue *q)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (unlikely(blk_queue_bypass(q)))
-		return NULL;
 	return __blkg_lookup(blkcg, q, false);
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c961329be96b..dd1e53fd4acf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -548,7 +548,6 @@ struct request_queue {
 
 	struct mutex		sysfs_lock;
 
-	int			bypass_depth;
 	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
@@ -586,7 +585,6 @@ struct request_queue {
 
 #define QUEUE_FLAG_STOPPED	1	/* queue is stopped */
 #define QUEUE_FLAG_DYING	2	/* queue being torn down */
-#define QUEUE_FLAG_BYPASS	3	/* act as dumb FIFO queue */
 #define QUEUE_FLAG_BIDI		4	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     5	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	6	/* complete on same CPU-group */
@@ -630,7 +628,6 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
-#define blk_queue_bypass(q)	test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
 #define blk_queue_init_done(q)	test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\

From 079076b3416e78ba2bb3ce38e05e320c388c3120 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:05 +0100
Subject: [PATCH 109/351] block: remove deadline __deadline manipulation
 helpers

No users left since the removal of the legacy request interface, we can
remove all the magic bit stealing now and make it a normal field.

But use WRITE_ONCE/READ_ONCE on the new deadline field, given that we
don't seem to have any mechanism to guarantee a new value actually
gets seen by other threads.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         |  4 ++--
 block/blk-timeout.c    |  8 +++++---
 block/blk.h            | 35 -----------------------------------
 include/linux/blkdev.h |  4 +---
 4 files changed, 8 insertions(+), 43 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 411be60d0cb6..4c82b4b4fa3e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -325,7 +325,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->special = NULL;
 	/* tag was already set */
 	rq->extra_len = 0;
-	rq->__deadline = 0;
+	WRITE_ONCE(rq->deadline, 0);
 
 	rq->timeout = 0;
 
@@ -839,7 +839,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 	if (rq->rq_flags & RQF_TIMED_OUT)
 		return false;
 
-	deadline = blk_rq_deadline(rq);
+	deadline = READ_ONCE(rq->deadline);
 	if (time_after_eq(jiffies, deadline))
 		return true;
 
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 006cff4390c0..3b0179fbdd6a 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -84,7 +84,7 @@ void blk_abort_request(struct request *req)
 	 * immediately and that scan sees the new timeout value.
 	 * No need for fancy synchronizations.
 	 */
-	blk_rq_set_deadline(req, jiffies);
+	WRITE_ONCE(req->deadline, jiffies);
 	kblockd_schedule_work(&req->q->timeout_work);
 }
 EXPORT_SYMBOL_GPL(blk_abort_request);
@@ -121,14 +121,16 @@ void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	req->rq_flags &= ~RQF_TIMED_OUT;
-	blk_rq_set_deadline(req, jiffies + req->timeout);
+
+	expiry = jiffies + req->timeout;
+	WRITE_ONCE(req->deadline, expiry);
 
 	/*
 	 * If the timer isn't already pending or this timeout is earlier
 	 * than an existing one, modify the timer. Round up to next nearest
 	 * second.
 	 */
-	expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req)));
+	expiry = blk_rq_timeout(round_jiffies_up(expiry));
 
 	if (!timer_pending(&q->timeout) ||
 	    time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk.h b/block/blk.h
index 41b64e6e101b..08a5845b03ba 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -238,26 +238,6 @@ void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
 void blk_account_io_done(struct request *req, u64 now);
 
-/*
- * EH timer and IO completion will both attempt to 'grab' the request, make
- * sure that only one of them succeeds. Steal the bottom bit of the
- * __deadline field for this.
- */
-static inline int blk_mark_rq_complete(struct request *rq)
-{
-	return test_and_set_bit(0, &rq->__deadline);
-}
-
-static inline void blk_clear_rq_complete(struct request *rq)
-{
-	clear_bit(0, &rq->__deadline);
-}
-
-static inline bool blk_rq_is_complete(struct request *rq)
-{
-	return test_bit(0, &rq->__deadline);
-}
-
 /*
  * Internal elevator interface
  */
@@ -322,21 +302,6 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 		q->last_merge = NULL;
 }
 
-/*
- * Steal a bit from this field for legacy IO path atomic IO marking. Note that
- * setting the deadline clears the bottom bit, potentially clearing the
- * completed bit. The user has to be OK with this (current ones are fine).
- */
-static inline void blk_rq_set_deadline(struct request *rq, unsigned long time)
-{
-	rq->__deadline = time & ~0x1UL;
-}
-
-static inline unsigned long blk_rq_deadline(struct request *rq)
-{
-	return rq->__deadline & ~0x1UL;
-}
-
 /*
  * Internal io_context interface
  */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dd1e53fd4acf..60507ab7b358 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -224,9 +224,7 @@ struct request {
 	refcount_t ref;
 
 	unsigned int timeout;
-
-	/* access through blk_rq_set_deadline, blk_rq_deadline */
-	unsigned long __deadline;
+	unsigned long deadline;
 
 	union {
 		struct __call_single_data csd;

From 39795d6534c6e698c4f9c065e0a5f4a2e5af7543 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:06 +0100
Subject: [PATCH 110/351] block: don't hold the queue_lock over
 blk_abort_request

There is nothing it could synchronize against, so don't go through
the pains of acquiring the lock.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-timeout.c                 |  2 +-
 drivers/ata/libata-eh.c             |  4 ----
 drivers/block/mtip32xx/mtip32xx.c   |  5 +----
 drivers/scsi/libsas/sas_ata.c       |  5 -----
 drivers/scsi/libsas/sas_scsi_host.c | 10 ++--------
 5 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 3b0179fbdd6a..124c26128bf6 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -75,7 +75,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
  * This function requests that the block layer start recovery for the
  * request by deleting the timer and calling the q's timeout function.
  * LLDDs who implement their own error recovery MAY ignore the timeout
- * event if they generated blk_abort_req. Must hold queue lock.
+ * event if they generated blk_abort_request.
  */
 void blk_abort_request(struct request *req)
 {
diff --git a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
index 01306c018398..938ed513b070 100644
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -919,8 +919,6 @@ static void ata_eh_set_pending(struct ata_port *ap, int fastdrain)
 void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
-	struct request_queue *q = qc->scsicmd->device->request_queue;
-	unsigned long flags;
 
 	WARN_ON(!ap->ops->error_handler);
 
@@ -932,9 +930,7 @@ void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
 	 * Note that ATA_QCFLAG_FAILED is unconditionally set after
 	 * this function completes.
 	 */
-	spin_lock_irqsave(q->queue_lock, flags);
 	blk_abort_request(qc->scsicmd->request);
-	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /**
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a4c44db097e0..2b0ac9d01e51 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2770,10 +2770,7 @@ restart_eh:
 
 			blk_mq_quiesce_queue(dd->queue);
 
-			spin_lock(dd->queue->queue_lock);
-			blk_mq_tagset_busy_iter(&dd->tags,
-							mtip_queue_cmd, dd);
-			spin_unlock(dd->queue->queue_lock);
+			blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd);
 
 			set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags);
 
diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c
index 4f6cdf53e913..c90b278cc28c 100644
--- a/drivers/scsi/libsas/sas_ata.c
+++ b/drivers/scsi/libsas/sas_ata.c
@@ -601,12 +601,7 @@ void sas_ata_task_abort(struct sas_task *task)
 
 	/* Bounce SCSI-initiated commands to the SCSI EH */
 	if (qc->scsicmd) {
-		struct request_queue *q = qc->scsicmd->device->request_queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
 		blk_abort_request(qc->scsicmd->request);
-		spin_unlock_irqrestore(q->queue_lock, flags);
 		return;
 	}
 
diff --git a/drivers/scsi/libsas/sas_scsi_host.c b/drivers/scsi/libsas/sas_scsi_host.c
index 33229348dcb6..af085432c5fe 100644
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -930,16 +930,10 @@ void sas_task_abort(struct sas_task *task)
 		return;
 	}
 
-	if (dev_is_sata(task->dev)) {
+	if (dev_is_sata(task->dev))
 		sas_ata_task_abort(task);
-	} else {
-		struct request_queue *q = sc->device->request_queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
+	else
 		blk_abort_request(sc->request);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
 }
 
 void sas_target_destroy(struct scsi_target *starget)

From 57d74df90783f6a6b3e79dfdd2a567ce5db3b790 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:07 +0100
Subject: [PATCH 111/351] block: use atomic bitops for ->queue_flags

->queue_flags is generally not set or cleared in the fast path, and also
generally set or cleared one flag at a time.  Make use of the normal
atomic bitops for it so that we don't need to take the queue_lock,
which is otherwise mostly unused in the core block layer now.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 54 ++++++----------------------------------
 block/blk-mq.c         |  2 +-
 block/blk-settings.c   | 10 +++-----
 block/blk-sysfs.c      | 28 +++++++++------------
 block/blk.h            | 56 ------------------------------------------
 include/linux/blkdev.h |  1 -
 6 files changed, 24 insertions(+), 127 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 1c9b6975cf0a..5c8e66a09d82 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -74,11 +74,7 @@ static struct workqueue_struct *kblockd_workqueue;
  */
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_set(flag, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	set_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL(blk_queue_flag_set);
 
@@ -89,11 +85,7 @@ EXPORT_SYMBOL(blk_queue_flag_set);
  */
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	queue_flag_clear(flag, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	clear_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL(blk_queue_flag_clear);
 
@@ -107,38 +99,10 @@ EXPORT_SYMBOL(blk_queue_flag_clear);
  */
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
 {
-	unsigned long flags;
-	bool res;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	res = queue_flag_test_and_set(flag, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return res;
+	return test_and_set_bit(flag, &q->queue_flags);
 }
 EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
 
-/**
- * blk_queue_flag_test_and_clear - atomically test and clear a queue flag
- * @flag: flag to be cleared
- * @q: request queue
- *
- * Returns the previous value of @flag - 0 if the flag was not set and 1 if
- * the flag was set.
- */
-bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q)
-{
-	unsigned long flags;
-	bool res;
-
-	spin_lock_irqsave(q->queue_lock, flags);
-	res = queue_flag_test_and_clear(flag, q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
-
-	return res;
-}
-EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear);
-
 void blk_rq_init(struct request_queue *q, struct request *rq)
 {
 	memset(rq, 0, sizeof(*rq));
@@ -368,12 +332,10 @@ void blk_cleanup_queue(struct request_queue *q)
 	/* mark @q DYING, no new request or merges will be allowed afterwards */
 	mutex_lock(&q->sysfs_lock);
 	blk_set_queue_dying(q);
-	spin_lock_irq(lock);
 
-	queue_flag_set(QUEUE_FLAG_NOMERGES, q);
-	queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
-	queue_flag_set(QUEUE_FLAG_DYING, q);
-	spin_unlock_irq(lock);
+	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
+	blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+	blk_queue_flag_set(QUEUE_FLAG_DYING, q);
 	mutex_unlock(&q->sysfs_lock);
 
 	/*
@@ -384,9 +346,7 @@ void blk_cleanup_queue(struct request_queue *q)
 
 	rq_qos_exit(q);
 
-	spin_lock_irq(lock);
-	queue_flag_set(QUEUE_FLAG_DEAD, q);
-	spin_unlock_irq(lock);
+	blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
 
 	/*
 	 * make sure all in-progress dispatch are completed because
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c82b4b4fa3e..e2717e843727 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2756,7 +2756,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
-		queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q);
+		blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
 
 	q->sg_reserved_size = INT_MAX;
 
diff --git a/block/blk-settings.c b/block/blk-settings.c
index cca83590a1dc..3abe831e92c8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -834,16 +834,14 @@ EXPORT_SYMBOL(blk_set_queue_depth);
  */
 void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 {
-	spin_lock_irq(q->queue_lock);
 	if (wc)
-		queue_flag_set(QUEUE_FLAG_WC, q);
+		blk_queue_flag_set(QUEUE_FLAG_WC, q);
 	else
-		queue_flag_clear(QUEUE_FLAG_WC, q);
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
 	if (fua)
-		queue_flag_set(QUEUE_FLAG_FUA, q);
+		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
 	else
-		queue_flag_clear(QUEUE_FLAG_FUA, q);
-	spin_unlock_irq(q->queue_lock);
+		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
 
 	wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index d4b1b84ba8ca..22fd086eba9f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -316,14 +316,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page,
 	if (ret < 0)
 		return ret;
 
-	spin_lock_irq(q->queue_lock);
-	queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
-	queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
+	blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q);
+	blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q);
 	if (nm == 2)
-		queue_flag_set(QUEUE_FLAG_NOMERGES, q);
+		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
 	else if (nm)
-		queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
-	spin_unlock_irq(q->queue_lock);
+		blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
 
 	return ret;
 }
@@ -347,18 +345,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 	if (ret < 0)
 		return ret;
 
-	spin_lock_irq(q->queue_lock);
 	if (val == 2) {
-		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-		queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+		blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+		blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
 	} else if (val == 1) {
-		queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+		blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
+		blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
 	} else if (val == 0) {
-		queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
-		queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+		blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+		blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
 	}
-	spin_unlock_irq(q->queue_lock);
 #endif
 	return ret;
 }
@@ -889,7 +885,7 @@ int blk_register_queue(struct gendisk *disk)
 	WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
 		  "%s is registering an already registered queue\n",
 		  kobject_name(&dev->kobj));
-	queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q);
+	blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
 
 	/*
 	 * SCSI probing may synchronously create and destroy a lot of
@@ -901,7 +897,7 @@ int blk_register_queue(struct gendisk *disk)
 	 * request_queues for non-existent devices never get registered.
 	 */
 	if (!blk_queue_init_done(q)) {
-		queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
+		blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
 		percpu_ref_switch_to_percpu(&q->q_usage_counter);
 	}
 
diff --git a/block/blk.h b/block/blk.h
index 08a5845b03ba..f2ddc71e93da 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -48,62 +48,6 @@ static inline void queue_lockdep_assert_held(struct request_queue *q)
 		lockdep_assert_held(q->queue_lock);
 }
 
-static inline void queue_flag_set_unlocked(unsigned int flag,
-					   struct request_queue *q)
-{
-	if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
-	    kref_read(&q->kobj.kref))
-		lockdep_assert_held(q->queue_lock);
-	__set_bit(flag, &q->queue_flags);
-}
-
-static inline void queue_flag_clear_unlocked(unsigned int flag,
-					     struct request_queue *q)
-{
-	if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) &&
-	    kref_read(&q->kobj.kref))
-		lockdep_assert_held(q->queue_lock);
-	__clear_bit(flag, &q->queue_flags);
-}
-
-static inline int queue_flag_test_and_clear(unsigned int flag,
-					    struct request_queue *q)
-{
-	queue_lockdep_assert_held(q);
-
-	if (test_bit(flag, &q->queue_flags)) {
-		__clear_bit(flag, &q->queue_flags);
-		return 1;
-	}
-
-	return 0;
-}
-
-static inline int queue_flag_test_and_set(unsigned int flag,
-					  struct request_queue *q)
-{
-	queue_lockdep_assert_held(q);
-
-	if (!test_bit(flag, &q->queue_flags)) {
-		__set_bit(flag, &q->queue_flags);
-		return 0;
-	}
-
-	return 1;
-}
-
-static inline void queue_flag_set(unsigned int flag, struct request_queue *q)
-{
-	queue_lockdep_assert_held(q);
-	__set_bit(flag, &q->queue_flags);
-}
-
-static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
-{
-	queue_lockdep_assert_held(q);
-	__clear_bit(flag, &q->queue_flags);
-}
-
 static inline struct blk_flush_queue *
 blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 60507ab7b358..30d8e0fbd104 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -621,7 +621,6 @@ struct request_queue {
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
 bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
-bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q);
 
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dying(q)	test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)

From 373e4af34ec13c17a6b80227c7d5d3719122eb77 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:08 +0100
Subject: [PATCH 112/351] block: remove queue_lockdep_assert_held

The only remaining user unconditionally drops and reacquires the lock,
which means we really don't need any additional (conditional) annotation.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-throttle.c |  1 -
 block/blk.h          | 13 -------------
 2 files changed, 14 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8e6f3c9821c2..a665b0950369 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2353,7 +2353,6 @@ void blk_throtl_drain(struct request_queue *q)
 	struct bio *bio;
 	int rw;
 
-	queue_lockdep_assert_held(q);
 	rcu_read_lock();
 
 	/*
diff --git a/block/blk.h b/block/blk.h
index f2ddc71e93da..027a0ccc175e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -35,19 +35,6 @@ extern struct kmem_cache *blk_requestq_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
-/*
- * @q->queue_lock is set while a queue is being initialized. Since we know
- * that no other threads access the queue object before @q->queue_lock has
- * been set, it is safe to manipulate queue flags without holding the
- * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and
- * blk_init_allocated_queue().
- */
-static inline void queue_lockdep_assert_held(struct request_queue *q)
-{
-	if (q->queue_lock)
-		lockdep_assert_held(q->queue_lock);
-}
-
 static inline struct blk_flush_queue *
 blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
 {

From d53375608ebf13c37721cf30677eba4333d18020 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:09 +0100
Subject: [PATCH 113/351] block: remove the unused lock argument to
 rq_qos_throttle

Unused now that the legacy request path is gone.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-iolatency.c | 24 ++++++------------------
 block/blk-mq.c        |  2 +-
 block/blk-rq-qos.c    |  5 ++---
 block/blk-rq-qos.h    |  4 ++--
 block/blk-wbt.c       | 16 ++++------------
 5 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 38c35c32aff2..8edf1b353ad1 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -276,10 +276,8 @@ static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
 
 static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
 				       struct iolatency_grp *iolat,
-				       spinlock_t *lock, bool issue_as_root,
+				       bool issue_as_root,
 				       bool use_memdelay)
-	__releases(lock)
-	__acquires(lock)
 {
 	struct rq_wait *rqw = &iolat->rq_wait;
 	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
@@ -311,14 +309,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
 		if (iolatency_may_queue(iolat, &wait, first_block))
 			break;
 		first_block = false;
-
-		if (lock) {
-			spin_unlock_irq(lock);
-			io_schedule();
-			spin_lock_irq(lock);
-		} else {
-			io_schedule();
-		}
+		io_schedule();
 	} while (1);
 
 	finish_wait(&rqw->wait, &wait);
@@ -478,8 +469,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
 	scale_change(iolat, direction > 0);
 }
 
-static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
-				     spinlock_t *lock)
+static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 {
 	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
 	struct blkcg *blkcg;
@@ -495,13 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
 	bio_associate_blkcg(bio, &blkcg->css);
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
-		if (!lock)
-			spin_lock_irq(q->queue_lock);
+		spin_lock_irq(q->queue_lock);
 		blkg = blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
-		if (!lock)
-			spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(q->queue_lock);
 	}
 	if (!blkg)
 		goto out;
@@ -518,7 +506,7 @@ out:
 		}
 
 		check_scale_change(iolat);
-		__blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
+		__blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
 				     (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
 		blkg = blkg->parent;
 	}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e2717e843727..a3f057fdd045 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1886,7 +1886,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	if (blk_mq_sched_bio_merge(q, bio))
 		return BLK_QC_T_NONE;
 
-	rq_qos_throttle(q, bio, NULL);
+	rq_qos_throttle(q, bio);
 
 	rq = blk_mq_get_request(q, bio, &data);
 	if (unlikely(!rq)) {
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 0005dfd568dd..f8a4d3fbb98c 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -67,14 +67,13 @@ void rq_qos_requeue(struct request_queue *q, struct request *rq)
 	}
 }
 
-void rq_qos_throttle(struct request_queue *q, struct bio *bio,
-		     spinlock_t *lock)
+void rq_qos_throttle(struct request_queue *q, struct bio *bio)
 {
 	struct rq_qos *rqos;
 
 	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
 		if (rqos->ops->throttle)
-			rqos->ops->throttle(rqos, bio, lock);
+			rqos->ops->throttle(rqos, bio);
 	}
 }
 
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 32b02efbfa66..b6b11d496007 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -25,7 +25,7 @@ struct rq_qos {
 };
 
 struct rq_qos_ops {
-	void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *);
+	void (*throttle)(struct rq_qos *, struct bio *);
 	void (*track)(struct rq_qos *, struct request *, struct bio *);
 	void (*issue)(struct rq_qos *, struct request *);
 	void (*requeue)(struct rq_qos *, struct request *);
@@ -103,7 +103,7 @@ void rq_qos_done(struct request_queue *, struct request *);
 void rq_qos_issue(struct request_queue *, struct request *);
 void rq_qos_requeue(struct request_queue *, struct request *);
 void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
-void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *);
+void rq_qos_throttle(struct request_queue *, struct bio *);
 void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
 void rq_qos_exit(struct request_queue *);
 #endif
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0fc222d4194b..e5a66c574683 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -521,9 +521,7 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
  * the timer to kick off queuing again.
  */
 static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
-		       unsigned long rw, spinlock_t *lock)
-	__releases(lock)
-	__acquires(lock)
+		       unsigned long rw)
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
 	struct wbt_wait_data data = {
@@ -561,13 +559,7 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
 			break;
 		}
 
-		if (lock) {
-			spin_unlock_irq(lock);
-			io_schedule();
-			spin_lock_irq(lock);
-		} else
-			io_schedule();
-
+		io_schedule();
 		has_sleeper = false;
 	} while (1);
 
@@ -624,7 +616,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
  * in an irq held spinlock, if it holds one when calling this function.
  * If we do sleep, we'll release and re-grab it.
  */
-static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
+static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
 {
 	struct rq_wb *rwb = RQWB(rqos);
 	enum wbt_flags flags;
@@ -636,7 +628,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock)
 		return;
 	}
 
-	__wbt_wait(rwb, flags, bio->bi_opf, lock);
+	__wbt_wait(rwb, flags, bio->bi_opf);
 
 	if (!blk_stat_is_active(rwb->cb))
 		rwb_arm_timer(rwb);

From 9809b4eed2cf3718ea160f70693b1714768f9822 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:10 +0100
Subject: [PATCH 114/351] block: update a few comments for the legacy request
 removal

Only the mq locking is left in the flush state machine.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-flush.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index c53197dcdd70..fcd18b158fd6 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -148,7 +148,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
  * completion and trigger the next step.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
+ * spin_lock_irq(fq->mq_flush_lock)
  *
  * RETURNS:
  * %true if requests were added to the dispatch queue, %false otherwise.
@@ -252,7 +252,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
  * Please read the comment at the top of this file for more info.
  *
  * CONTEXT:
- * spin_lock_irq(q->queue_lock or fq->mq_flush_lock)
+ * spin_lock_irq(fq->mq_flush_lock)
  *
  */
 static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,

From b6676f653f13f83582985bc713525a48d735b2a3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:11 +0100
Subject: [PATCH 115/351] block: remove a few unused exports

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c   | 6 ------
 block/blk-ioc.c      | 3 ---
 block/blk-mq-sysfs.c | 1 -
 block/blk-softirq.c  | 1 -
 block/blk-stat.c     | 4 ----
 block/blk-wbt.c      | 2 --
 6 files changed, 17 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index a95cddb39f1c..3296c0b7353a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -424,7 +424,6 @@ const char *blkg_dev_name(struct blkcg_gq *blkg)
 		return dev_name(blkg->q->backing_dev_info->dev);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(blkg_dev_name);
 
 /**
  * blkcg_print_blkgs - helper for printing per-blkg data
@@ -860,7 +859,6 @@ fail:
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
 /**
  * blkg_conf_finish - finish up per-blkg config update
@@ -876,7 +874,6 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 	rcu_read_unlock();
 	put_disk_and_module(ctx->disk);
 }
-EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
 static int blkcg_print_stat(struct seq_file *sf, void *v)
 {
@@ -1691,7 +1688,6 @@ out:
 	rcu_read_unlock();
 	blk_put_queue(q);
 }
-EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current);
 
 /**
  * blkcg_schedule_throttle - this task needs to check for throttling
@@ -1725,7 +1721,6 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
 		current->use_memdelay = use_memdelay;
 	set_notify_resume(current);
 }
-EXPORT_SYMBOL_GPL(blkcg_schedule_throttle);
 
 /**
  * blkcg_add_delay - add delay to this blkg
@@ -1740,7 +1735,6 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
 	blkcg_scale_delay(blkg, now);
 	atomic64_add(delta, &blkg->delay_nsec);
 }
-EXPORT_SYMBOL_GPL(blkcg_add_delay);
 
 module_param(blkcg_debug_stats, bool, 0644);
 MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 56755ad5ac88..f91ca6b70d6a 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -28,7 +28,6 @@ void get_io_context(struct io_context *ioc)
 	BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
 	atomic_long_inc(&ioc->refcount);
 }
-EXPORT_SYMBOL(get_io_context);
 
 static void icq_free_icq_rcu(struct rcu_head *head)
 {
@@ -160,7 +159,6 @@ void put_io_context(struct io_context *ioc)
 	if (free_ioc)
 		kmem_cache_free(iocontext_cachep, ioc);
 }
-EXPORT_SYMBOL(put_io_context);
 
 /**
  * put_io_context_active - put active reference on ioc
@@ -315,7 +313,6 @@ struct io_context *get_task_io_context(struct task_struct *task,
 
 	return NULL;
 }
-EXPORT_SYMBOL(get_task_io_context);
 
 /**
  * ioc_lookup_icq - lookup io_cq from ioc
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 2d737f9e7ba7..3d25b9c419e9 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -350,7 +350,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q)
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(blk_mq_register_dev);
 
 void blk_mq_sysfs_unregister(struct request_queue *q)
 {
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 1534066e306e..457d9ba3eb20 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -143,7 +143,6 @@ do_local:
 
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL(__blk_complete_request);
 
 static __init int blk_softirq_init(void)
 {
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 90561af85a62..696a04176e4d 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -130,7 +130,6 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *),
 
 	return cb;
 }
-EXPORT_SYMBOL_GPL(blk_stat_alloc_callback);
 
 void blk_stat_add_callback(struct request_queue *q,
 			   struct blk_stat_callback *cb)
@@ -151,7 +150,6 @@ void blk_stat_add_callback(struct request_queue *q,
 	blk_queue_flag_set(QUEUE_FLAG_STATS, q);
 	spin_unlock(&q->stats->lock);
 }
-EXPORT_SYMBOL_GPL(blk_stat_add_callback);
 
 void blk_stat_remove_callback(struct request_queue *q,
 			      struct blk_stat_callback *cb)
@@ -164,7 +162,6 @@ void blk_stat_remove_callback(struct request_queue *q,
 
 	del_timer_sync(&cb->timer);
 }
-EXPORT_SYMBOL_GPL(blk_stat_remove_callback);
 
 static void blk_stat_free_callback_rcu(struct rcu_head *head)
 {
@@ -181,7 +178,6 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
 	if (cb)
 		call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
 }
-EXPORT_SYMBOL_GPL(blk_stat_free_callback);
 
 void blk_stat_enable_accounting(struct request_queue *q)
 {
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index e5a66c574683..919444d75489 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -754,8 +754,6 @@ void wbt_disable_default(struct request_queue *q)
 	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
 		rwb->wb_normal = 0;
 }
-EXPORT_SYMBOL_GPL(wbt_disable_default);
-
 
 static struct rq_qos_ops wbt_rqos_ops = {
 	.throttle = wbt_wait,

From 04be60b5e4e03fd3d58f7f25b782240a45272fb0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:12 +0100
Subject: [PATCH 116/351] blk-cgroup: consolidate error handling in
 blkcg_init_queue

Use a goto label to merge two identical pieces of error handling code.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3296c0b7353a..717ab38a6c67 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1173,21 +1173,19 @@ int blkcg_init_queue(struct request_queue *q)
 		radix_tree_preload_end();
 
 	ret = blk_iolatency_init(q);
-	if (ret) {
-		spin_lock_irq(q->queue_lock);
-		blkg_destroy_all(q);
-		spin_unlock_irq(q->queue_lock);
-		return ret;
-	}
+	if (ret)
+		goto err_destroy_all;
 
 	ret = blk_throtl_init(q);
-	if (ret) {
-		spin_lock_irq(q->queue_lock);
-		blkg_destroy_all(q);
-		spin_unlock_irq(q->queue_lock);
-	}
-	return ret;
+	if (ret)
+		goto err_destroy_all;
+	return 0;
 
+err_destroy_all:
+	spin_lock_irq(q->queue_lock);
+	blkg_destroy_all(q);
+	spin_unlock_irq(q->queue_lock);
+	return ret;
 err_unlock:
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();

From 7fb1763de6f8b4d6e7a69a08a14ce5496500f2c5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:13 +0100
Subject: [PATCH 117/351] blk-cgroup: move locking into blkg_destroy_all

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 717ab38a6c67..3ba23b9bfeb9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -349,8 +349,7 @@ static void blkg_destroy_all(struct request_queue *q)
 {
 	struct blkcg_gq *blkg, *n;
 
-	lockdep_assert_held(q->queue_lock);
-
+	spin_lock_irq(q->queue_lock);
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
@@ -360,6 +359,7 @@ static void blkg_destroy_all(struct request_queue *q)
 	}
 
 	q->root_blkg = NULL;
+	spin_unlock_irq(q->queue_lock);
 }
 
 /*
@@ -1182,9 +1182,7 @@ int blkcg_init_queue(struct request_queue *q)
 	return 0;
 
 err_destroy_all:
-	spin_lock_irq(q->queue_lock);
 	blkg_destroy_all(q);
-	spin_unlock_irq(q->queue_lock);
 	return ret;
 err_unlock:
 	spin_unlock_irq(q->queue_lock);
@@ -1222,10 +1220,7 @@ void blkcg_drain_queue(struct request_queue *q)
  */
 void blkcg_exit_queue(struct request_queue *q)
 {
-	spin_lock_irq(q->queue_lock);
 	blkg_destroy_all(q);
-	spin_unlock_irq(q->queue_lock);
-
 	blk_throtl_exit(q);
 }
 

From 8295a69bdc3cb8707e645f9b2de6f3019a521882 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:14 +0100
Subject: [PATCH 118/351] drbd: don't override the queue_lock

The DRBD req_lock and block layer queue_lock are used for entirely
different resources.  Stop using the req_lock as the block layer
queue_lock.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/drbd/drbd_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa8204214ac0..b66c59ce6260 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 
 	drbd_init_set_defaults(device);
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, &resource->req_lock);
+	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
 	if (!q)
 		goto out_no_q;
 	device->rq_queue = q;

From 68fc68f2ff620852ee43ee7a2831bc5eeb9472d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:15 +0100
Subject: [PATCH 119/351] umem: don't override the queue_lock

The umem card->lock and the block layer queue_lock are used for entirely
different resources.  Stop using card->lock as the block layer
queue_lock.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/umem.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index be3e3ab79950..8a27b5adc2b3 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -888,8 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	card->biotail = &card->bio;
 	spin_lock_init(&card->lock);
 
-	card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE,
-					   &card->lock);
+	card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
 	if (!card->queue)
 		goto failed_alloc;
 

From b061b326287d45aeaf313f7dddd02e88e31db14b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:16 +0100
Subject: [PATCH 120/351] mmc: simplify queue initialization

Merge three functions initializing the queue into a single one, and drop
an unused argument for it.

Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/mmc/core/block.c |  2 +-
 drivers/mmc/core/queue.c | 85 ++++++++++++++--------------------------
 drivers/mmc/core/queue.h |  3 +-
 3 files changed, 31 insertions(+), 59 deletions(-)

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index c35b5b08bb33..27606e1382e5 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -2334,7 +2334,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 	INIT_LIST_HEAD(&md->rpmbs);
 	md->usage = 1;
 
-	ret = mmc_init_queue(&md->queue, card, &md->lock, subname);
+	ret = mmc_init_queue(&md->queue, card, &md->lock);
 	if (ret)
 		goto err_putdisk;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 6edffeed9953..26ad1f571677 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -378,14 +378,37 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
 	init_waitqueue_head(&mq->wait);
 }
 
-static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth,
-			     const struct blk_mq_ops *mq_ops, spinlock_t *lock)
+/* Set queue depth to get a reasonable value for q->nr_requests */
+#define MMC_QUEUE_DEPTH 64
+
+/**
+ * mmc_init_queue - initialise a queue structure.
+ * @mq: mmc queue
+ * @card: mmc card to attach this queue
+ * @lock: queue lock
+ *
+ * Initialise a MMC card request queue.
+ */
+int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
+		   spinlock_t *lock)
 {
+	struct mmc_host *host = card->host;
 	int ret;
 
+	mq->card = card;
+	mq->use_cqe = host->cqe_enabled;
+
 	memset(&mq->tag_set, 0, sizeof(mq->tag_set));
-	mq->tag_set.ops = mq_ops;
-	mq->tag_set.queue_depth = q_depth;
+	mq->tag_set.ops = &mmc_mq_ops;
+	/*
+	 * The queue depth for CQE must match the hardware because the request
+	 * tag is used to index the hardware queue.
+	 */
+	if (mq->use_cqe)
+		mq->tag_set.queue_depth =
+			min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth);
+	else
+		mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
 	mq->tag_set.numa_node = NUMA_NO_NODE;
 	mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
 			    BLK_MQ_F_BLOCKING;
@@ -405,66 +428,16 @@ static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth,
 
 	mq->queue->queue_lock = lock;
 	mq->queue->queuedata = mq;
+	blk_queue_rq_timeout(mq->queue, 60 * HZ);
 
+	mmc_setup_queue(mq, card);
 	return 0;
 
 free_tag_set:
 	blk_mq_free_tag_set(&mq->tag_set);
-
 	return ret;
 }
 
-/* Set queue depth to get a reasonable value for q->nr_requests */
-#define MMC_QUEUE_DEPTH 64
-
-static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
-			 spinlock_t *lock)
-{
-	struct mmc_host *host = card->host;
-	int q_depth;
-	int ret;
-
-	/*
-	 * The queue depth for CQE must match the hardware because the request
-	 * tag is used to index the hardware queue.
-	 */
-	if (mq->use_cqe)
-		q_depth = min_t(int, card->ext_csd.cmdq_depth, host->cqe_qdepth);
-	else
-		q_depth = MMC_QUEUE_DEPTH;
-
-	ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
-	if (ret)
-		return ret;
-
-	blk_queue_rq_timeout(mq->queue, 60 * HZ);
-
-	mmc_setup_queue(mq, card);
-
-	return 0;
-}
-
-/**
- * mmc_init_queue - initialise a queue structure.
- * @mq: mmc queue
- * @card: mmc card to attach this queue
- * @lock: queue lock
- * @subname: partition subname
- *
- * Initialise a MMC card request queue.
- */
-int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
-		   spinlock_t *lock, const char *subname)
-{
-	struct mmc_host *host = card->host;
-
-	mq->card = card;
-
-	mq->use_cqe = host->cqe_enabled;
-
-	return mmc_mq_init(mq, card, lock);
-}
-
 void mmc_queue_suspend(struct mmc_queue *mq)
 {
 	blk_mq_quiesce_queue(mq->queue);
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 9bf3c9245075..29218e12900d 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -95,8 +95,7 @@ struct mmc_queue {
 	struct work_struct	complete_work;
 };
 
-extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
-			  const char *);
+extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *);
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);

From 310df020cdd7570e1a8ee43bd58999a743686eda Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:17 +0100
Subject: [PATCH 121/351] mmc: stop abusing the request queue_lock pointer

mmc uses the block layer struct request pointer to indirect their own
lock to the mmc_queue structure, given that the original lock isn't
reachable outside of block.c.  Add a lock pointer to struct mmc_queue
instead and stop overriding the block layer lock which protects fields
entirely separate from the mmc use.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/mmc/core/block.c | 22 ++++++++++------------
 drivers/mmc/core/queue.c | 26 +++++++++++++-------------
 drivers/mmc/core/queue.h |  1 +
 3 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 27606e1382e5..70ec465beb69 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1483,7 +1483,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
 		blk_mq_end_request(req, BLK_STS_OK);
 	}
 
-	spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(mq->lock, flags);
 
 	mq->in_flight[mmc_issue_type(mq, req)] -= 1;
 
@@ -1491,7 +1491,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
 
 	mmc_cqe_check_busy(mq);
 
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(mq->lock, flags);
 
 	if (!mq->cqe_busy)
 		blk_mq_run_hw_queues(q, true);
@@ -1988,17 +1988,16 @@ static void mmc_blk_mq_poll_completion(struct mmc_queue *mq,
 
 static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req)
 {
-	struct request_queue *q = req->q;
 	unsigned long flags;
 	bool put_card;
 
-	spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(mq->lock, flags);
 
 	mq->in_flight[mmc_issue_type(mq, req)] -= 1;
 
 	put_card = (mmc_tot_in_flight(mq) == 0);
 
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(mq->lock, flags);
 
 	if (put_card)
 		mmc_put_card(mq->card, &mq->ctx);
@@ -2094,11 +2093,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
 		 * request does not need to wait (although it does need to
 		 * complete complete_req first).
 		 */
-		spin_lock_irqsave(q->queue_lock, flags);
+		spin_lock_irqsave(mq->lock, flags);
 		mq->complete_req = req;
 		mq->rw_wait = false;
 		waiting = mq->waiting;
-		spin_unlock_irqrestore(q->queue_lock, flags);
+		spin_unlock_irqrestore(mq->lock, flags);
 
 		/*
 		 * If 'waiting' then the waiting task will complete this
@@ -2117,10 +2116,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
 	/* Take the recovery path for errors or urgent background operations */
 	if (mmc_blk_rq_error(&mqrq->brq) ||
 	    mmc_blk_urgent_bkops_needed(mq, mqrq)) {
-		spin_lock_irqsave(q->queue_lock, flags);
+		spin_lock_irqsave(mq->lock, flags);
 		mq->recovery_needed = true;
 		mq->recovery_req = req;
-		spin_unlock_irqrestore(q->queue_lock, flags);
+		spin_unlock_irqrestore(mq->lock, flags);
 		wake_up(&mq->wait);
 		schedule_work(&mq->recovery_work);
 		return;
@@ -2136,7 +2135,6 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
 
 static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
 {
-	struct request_queue *q = mq->queue;
 	unsigned long flags;
 	bool done;
 
@@ -2144,7 +2142,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
 	 * Wait while there is another request in progress, but not if recovery
 	 * is needed. Also indicate whether there is a request waiting to start.
 	 */
-	spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(mq->lock, flags);
 	if (mq->recovery_needed) {
 		*err = -EBUSY;
 		done = true;
@@ -2152,7 +2150,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
 		done = !mq->rw_wait;
 	}
 	mq->waiting = !done;
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(mq->lock, flags);
 
 	return done;
 }
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 26ad1f571677..4485cf12218c 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq)
 	struct mmc_queue *mq = q->queuedata;
 	unsigned long flags;
 
-	spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(mq->lock, flags);
 	__mmc_cqe_recovery_notifier(mq);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(mq->lock, flags);
 }
 
 static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
@@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(mq->lock, flags);
 
 	if (mq->recovery_needed || !mq->use_cqe)
 		ret = BLK_EH_RESET_TIMER;
 	else
 		ret = mmc_cqe_timed_out(req);
 
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(mq->lock, flags);
 
 	return ret;
 }
@@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work)
 
 	mq->in_recovery = false;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(mq->lock);
 	mq->recovery_needed = false;
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(mq->lock);
 
 	mmc_put_card(mq->card, &mq->ctx);
 
@@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	issue_type = mmc_issue_type(mq, req);
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(mq->lock);
 
 	if (mq->recovery_needed || mq->busy) {
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(mq->lock);
 		return BLK_STS_RESOURCE;
 	}
 
@@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	case MMC_ISSUE_DCMD:
 		if (mmc_cqe_dcmd_busy(mq)) {
 			mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
-			spin_unlock_irq(q->queue_lock);
+			spin_unlock_irq(mq->lock);
 			return BLK_STS_RESOURCE;
 		}
 		break;
@@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	get_card = (mmc_tot_in_flight(mq) == 1);
 	cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1);
 
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(mq->lock);
 
 	if (!(req->rq_flags & RQF_DONTPREP)) {
 		req_to_mmc_queue_req(req)->retries = 0;
@@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (issued != MMC_REQ_STARTED) {
 		bool put_card = false;
 
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(mq->lock);
 		mq->in_flight[issue_type] -= 1;
 		if (mmc_tot_in_flight(mq) == 0)
 			put_card = true;
 		mq->busy = false;
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(mq->lock);
 		if (put_card)
 			mmc_put_card(card, &mq->ctx);
 	} else {
@@ -396,6 +396,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	int ret;
 
 	mq->card = card;
+	mq->lock = lock;
 	mq->use_cqe = host->cqe_enabled;
 
 	memset(&mq->tag_set, 0, sizeof(mq->tag_set));
@@ -426,7 +427,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 		goto free_tag_set;
 	}
 
-	mq->queue->queue_lock = lock;
 	mq->queue->queuedata = mq;
 	blk_queue_rq_timeout(mq->queue, 60 * HZ);
 
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 29218e12900d..5421f1542e71 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -73,6 +73,7 @@ struct mmc_queue_req {
 
 struct mmc_queue {
 	struct mmc_card		*card;
+	spinlock_t		*lock;
 	struct mmc_ctx		ctx;
 	struct blk_mq_tag_set	tag_set;
 	struct mmc_blk_data	*blkdata;

From 6d46964230d182c4b6097379738849a809d791dc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 17:02:18 +0100
Subject: [PATCH 122/351] block: remove the lock argument to
 blk_alloc_queue_node

With the legacy request path gone there is no real need to override the
queue_lock.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c               | 16 +++-------------
 block/blk-mq.c                 |  2 +-
 drivers/block/drbd/drbd_main.c |  2 +-
 drivers/block/null_blk_main.c  |  3 +--
 drivers/block/umem.c           |  2 +-
 drivers/lightnvm/core.c        |  2 +-
 drivers/md/dm.c                |  2 +-
 drivers/nvdimm/pmem.c          |  2 +-
 drivers/nvme/host/multipath.c  |  2 +-
 include/linux/blkdev.h         |  3 +--
 10 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5c8e66a09d82..3f94c9de0252 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -393,7 +393,7 @@ EXPORT_SYMBOL(blk_cleanup_queue);
 
 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 {
-	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL);
+	return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
@@ -473,17 +473,8 @@ static void blk_rq_timed_out_timer(struct timer_list *t)
  * blk_alloc_queue_node - allocate a request queue
  * @gfp_mask: memory allocation flags
  * @node_id: NUMA node to allocate memory from
- * @lock: For legacy queues, pointer to a spinlock that will be used to e.g.
- *        serialize calls to the legacy .request_fn() callback. Ignored for
- *	  blk-mq request queues.
- *
- * Note: pass the queue lock as the third argument to this function instead of
- * setting the queue lock pointer explicitly to avoid triggering a sporadic
- * crash in the blkcg code. This function namely calls blkcg_init_queue() and
- * the queue lock pointer must be set before blkcg_init_queue() is called.
  */
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
-					   spinlock_t *lock)
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 {
 	struct request_queue *q;
 	int ret;
@@ -534,8 +525,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
 #endif
 	mutex_init(&q->sysfs_lock);
 	spin_lock_init(&q->__queue_lock);
-
-	q->queue_lock = lock ? : &q->__queue_lock;
+	q->queue_lock = &q->__queue_lock;
 
 	init_waitqueue_head(&q->mq_freeze_wq);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a3f057fdd045..3b823891b3ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2548,7 +2548,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct request_queue *uninit_q, *q;
 
-	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL);
+	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
 	if (!uninit_q)
 		return ERR_PTR(-ENOMEM);
 
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index b66c59ce6260..f973a2a845c8 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2792,7 +2792,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
 
 	drbd_init_set_defaults(device);
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
 	if (!q)
 		goto out_no_q;
 	device->rq_queue = q;
diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
index 63c23fcfc4df..62c9654b9ce8 100644
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -1659,8 +1659,7 @@ static int null_add_dev(struct nullb_device *dev)
 		}
 		null_init_queues(nullb);
 	} else if (dev->queue_mode == NULL_Q_BIO) {
-		nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node,
-						NULL);
+		nullb->q = blk_alloc_queue_node(GFP_KERNEL, dev->home_node);
 		if (!nullb->q) {
 			rv = -ENOMEM;
 			goto out_cleanup_queues;
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8a27b5adc2b3..aa035cf8a51d 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -888,7 +888,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	card->biotail = &card->bio;
 	spin_lock_init(&card->lock);
 
-	card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+	card->queue = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
 	if (!card->queue)
 		goto failed_alloc;
 
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index efb976a863d2..60ab11fcc81c 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -389,7 +389,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
 		goto err_dev;
 	}
 
-	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node, NULL);
+	tqueue = blk_alloc_queue_node(GFP_KERNEL, dev->q->node);
 	if (!tqueue) {
 		ret = -ENOMEM;
 		goto err_disk;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c510179a7f84..a733e4c920af 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1896,7 +1896,7 @@ static struct mapped_device *alloc_dev(int minor)
 	INIT_LIST_HEAD(&md->table_devices);
 	spin_lock_init(&md->uevent_lock);
 
-	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
+	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
 	if (!md->queue)
 		goto bad;
 	md->queue->queuedata = md;
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 0e39e3d1846f..f7019294740c 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -393,7 +393,7 @@ static int pmem_attach_disk(struct device *dev,
 		return -EBUSY;
 	}
 
-	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev), NULL);
+	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
 	if (!q)
 		return -ENOMEM;
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5e3cc8c59a39..b82b0d3ca39a 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
 		return 0;
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
 	if (!q)
 		goto out;
 	q->queuedata = head;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 30d8e0fbd104..c4a3a660e3f0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1122,8 +1122,7 @@ extern long nr_blockdev_pages(void);
 
 bool __must_check blk_get_queue(struct request_queue *);
 struct request_queue *blk_alloc_queue(gfp_t);
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id,
-					   spinlock_t *lock);
+struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id);
 extern void blk_put_queue(struct request_queue *);
 extern void blk_set_queue_dying(struct request_queue *);
 

From 0d945c1f966b2bcb67bb12be749da0a7fb00201b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 15 Nov 2018 12:17:28 -0700
Subject: [PATCH 123/351] block: remove the queue_lock indirection

With the legacy request path gone there is no good reason to keep
queue_lock as a pointer, we can always use the embedded lock now.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>

Fixed floppy and blk-cgroup missing conversions and half done edits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c         |  2 +-
 block/bfq-iosched.c        | 16 +++++-----
 block/blk-cgroup.c         | 62 +++++++++++++++++++-------------------
 block/blk-core.c           | 10 +-----
 block/blk-ioc.c            | 14 ++++-----
 block/blk-iolatency.c      |  4 +--
 block/blk-mq-sched.c       |  4 +--
 block/blk-pm.c             | 20 ++++++------
 block/blk-pm.h             |  6 ++--
 block/blk-sysfs.c          |  4 +--
 block/blk-throttle.c       | 22 +++++++-------
 drivers/block/floppy.c     |  8 ++---
 drivers/block/pktcdvd.c    |  4 +--
 drivers/ide/ide-pm.c       | 10 +++---
 include/linux/blk-cgroup.h |  4 +--
 include/linux/blkdev.h     |  8 +----
 16 files changed, 92 insertions(+), 106 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 9fe5952d117d..a7a1712632b0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
 
 	parent = bfqg_parent(bfqg);
 
-	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+	lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock);
 
 	if (unlikely(!parent))
 		return;
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index c7636cbefc85..67b22c924aee 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
 		unsigned long flags;
 		struct bfq_io_cq *icq;
 
-		spin_lock_irqsave(q->queue_lock, flags);
+		spin_lock_irqsave(&q->queue_lock, flags);
 		icq = icq_to_bic(ioc_lookup_icq(ioc, q));
-		spin_unlock_irqrestore(q->queue_lock, flags);
+		spin_unlock_irqrestore(&q->queue_lock, flags);
 
 		return icq;
 	}
@@ -4034,7 +4034,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
 	 * In addition, the following queue lock guarantees that
 	 * bfqq_group(bfqq) exists as well.
 	 */
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	if (idle_timer_disabled)
 		/*
 		 * Since the idle timer has been disabled,
@@ -4053,7 +4053,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q,
 		bfqg_stats_set_start_empty_time(bfqg);
 		bfqg_stats_update_io_remove(bfqg, rq->cmd_flags);
 	}
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }
 #else
 static inline void bfq_update_dispatch_stats(struct request_queue *q,
@@ -4637,11 +4637,11 @@ static void bfq_update_insert_stats(struct request_queue *q,
 	 * In addition, the following queue lock guarantees that
 	 * bfqq_group(bfqq) exists as well.
 	 */
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags);
 	if (idle_timer_disabled)
 		bfqg_stats_update_idle_time(bfqq_group(bfqq));
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }
 #else
 static inline void bfq_update_insert_stats(struct request_queue *q,
@@ -5382,9 +5382,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
 	}
 	eq->elevator_data = bfqd;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	q->elevator = eq;
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	/*
 	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 3ba23b9bfeb9..0f6b44614165 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -147,7 +147,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 	blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
 	if (blkg && blkg->q == q) {
 		if (update_hint) {
-			lockdep_assert_held(q->queue_lock);
+			lockdep_assert_held(&q->queue_lock);
 			rcu_assign_pointer(blkcg->blkg_hint, blkg);
 		}
 		return blkg;
@@ -170,7 +170,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	int i, ret;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&q->queue_lock);
 
 	/* blkg holds a reference to blkcg */
 	if (!css_tryget_online(&blkcg->css)) {
@@ -268,7 +268,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 	struct blkcg_gq *blkg;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&q->queue_lock);
 
 	blkg = __blkg_lookup(blkcg, q, true);
 	if (blkg)
@@ -299,7 +299,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	struct blkcg_gq *parent = blkg->parent;
 	int i;
 
-	lockdep_assert_held(blkg->q->queue_lock);
+	lockdep_assert_held(&blkg->q->queue_lock);
 	lockdep_assert_held(&blkcg->lock);
 
 	/* Something wrong if we are trying to remove same group twice */
@@ -349,7 +349,7 @@ static void blkg_destroy_all(struct request_queue *q)
 {
 	struct blkcg_gq *blkg, *n;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
@@ -359,7 +359,7 @@ static void blkg_destroy_all(struct request_queue *q)
 	}
 
 	q->root_blkg = NULL;
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }
 
 /*
@@ -454,10 +454,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
-		spin_lock_irq(blkg->q->queue_lock);
+		spin_lock_irq(&blkg->q->queue_lock);
 		if (blkcg_policy_enabled(blkg->q, pol))
 			total += prfill(sf, blkg->pd[pol->plid], data);
-		spin_unlock_irq(blkg->q->queue_lock);
+		spin_unlock_irq(&blkg->q->queue_lock);
 	}
 	rcu_read_unlock();
 
@@ -655,7 +655,7 @@ u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
 	struct cgroup_subsys_state *pos_css;
 	u64 sum = 0;
 
-	lockdep_assert_held(blkg->q->queue_lock);
+	lockdep_assert_held(&blkg->q->queue_lock);
 
 	rcu_read_lock();
 	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
@@ -698,7 +698,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 	struct blkg_rwstat sum = { };
 	int i;
 
-	lockdep_assert_held(blkg->q->queue_lock);
+	lockdep_assert_held(&blkg->q->queue_lock);
 
 	rcu_read_lock();
 	blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
@@ -729,7 +729,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
 					  struct request_queue *q)
 {
 	WARN_ON_ONCE(!rcu_read_lock_held());
-	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&q->queue_lock);
 
 	if (!blkcg_policy_enabled(q, pol))
 		return ERR_PTR(-EOPNOTSUPP);
@@ -750,7 +750,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   char *input, struct blkg_conf_ctx *ctx)
-	__acquires(rcu) __acquires(disk->queue->queue_lock)
+	__acquires(rcu) __acquires(&disk->queue->queue_lock)
 {
 	struct gendisk *disk;
 	struct request_queue *q;
@@ -778,7 +778,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	q = disk->queue;
 
 	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 
 	blkg = blkg_lookup_check(blkcg, pol, q);
 	if (IS_ERR(blkg)) {
@@ -805,7 +805,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		}
 
 		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 		rcu_read_unlock();
 
 		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
@@ -815,7 +815,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		}
 
 		rcu_read_lock();
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
 
 		blkg = blkg_lookup_check(pos, pol, q);
 		if (IS_ERR(blkg)) {
@@ -843,7 +843,7 @@ success:
 	return 0;
 
 fail_unlock:
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 	rcu_read_unlock();
 fail:
 	put_disk_and_module(disk);
@@ -868,9 +868,9 @@ fail:
  * with blkg_conf_prep().
  */
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
-	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
+	__releases(&ctx->disk->queue->queue_lock) __releases(rcu)
 {
-	spin_unlock_irq(ctx->disk->queue->queue_lock);
+	spin_unlock_irq(&ctx->disk->queue->queue_lock);
 	rcu_read_unlock();
 	put_disk_and_module(ctx->disk);
 }
@@ -903,7 +903,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		 */
 		off += scnprintf(buf+off, size-off, "%s ", dname);
 
-		spin_lock_irq(blkg->q->queue_lock);
+		spin_lock_irq(&blkg->q->queue_lock);
 
 		rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
 					offsetof(struct blkcg_gq, stat_bytes));
@@ -917,7 +917,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 		wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
 		dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
 
-		spin_unlock_irq(blkg->q->queue_lock);
+		spin_unlock_irq(&blkg->q->queue_lock);
 
 		if (rbytes || wbytes || rios || wios) {
 			has_stats = true;
@@ -1038,9 +1038,9 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg)
 						struct blkcg_gq, blkcg_node);
 		struct request_queue *q = blkg->q;
 
-		if (spin_trylock(q->queue_lock)) {
+		if (spin_trylock(&q->queue_lock)) {
 			blkg_destroy(blkg);
-			spin_unlock(q->queue_lock);
+			spin_unlock(&q->queue_lock);
 		} else {
 			spin_unlock_irq(&blkcg->lock);
 			cpu_relax();
@@ -1161,12 +1161,12 @@ int blkcg_init_queue(struct request_queue *q)
 
 	/* Make sure the root blkg exists. */
 	rcu_read_lock();
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	blkg = blkg_create(&blkcg_root, q, new_blkg);
 	if (IS_ERR(blkg))
 		goto err_unlock;
 	q->root_blkg = blkg;
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 	rcu_read_unlock();
 
 	if (preloaded)
@@ -1185,7 +1185,7 @@ err_destroy_all:
 	blkg_destroy_all(q);
 	return ret;
 err_unlock:
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 	rcu_read_unlock();
 	if (preloaded)
 		radix_tree_preload_end();
@@ -1200,7 +1200,7 @@ err_unlock:
  */
 void blkcg_drain_queue(struct request_queue *q)
 {
-	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&q->queue_lock);
 
 	/*
 	 * @q could be exiting and already have destroyed all blkgs as
@@ -1335,7 +1335,7 @@ pd_prealloc:
 		}
 	}
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 
 	list_for_each_entry(blkg, &q->blkg_list, q_node) {
 		struct blkg_policy_data *pd;
@@ -1347,7 +1347,7 @@ pd_prealloc:
 		if (!pd)
 			swap(pd, pd_prealloc);
 		if (!pd) {
-			spin_unlock_irq(q->queue_lock);
+			spin_unlock_irq(&q->queue_lock);
 			goto pd_prealloc;
 		}
 
@@ -1361,7 +1361,7 @@ pd_prealloc:
 	__set_bit(pol->plid, q->blkcg_pols);
 	ret = 0;
 
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 out_bypass_end:
 	if (q->mq_ops)
 		blk_mq_unfreeze_queue(q);
@@ -1390,7 +1390,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 	if (q->mq_ops)
 		blk_mq_freeze_queue(q);
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
 
@@ -1403,7 +1403,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 		}
 	}
 
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	if (q->mq_ops)
 		blk_mq_unfreeze_queue(q);
diff --git a/block/blk-core.c b/block/blk-core.c
index 3f94c9de0252..92b6b200e9fb 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -327,8 +327,6 @@ void blk_exit_queue(struct request_queue *q)
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
-	spinlock_t *lock = q->queue_lock;
-
 	/* mark @q DYING, no new request or merges will be allowed afterwards */
 	mutex_lock(&q->sysfs_lock);
 	blk_set_queue_dying(q);
@@ -381,11 +379,6 @@ void blk_cleanup_queue(struct request_queue *q)
 
 	percpu_ref_exit(&q->q_usage_counter);
 
-	spin_lock_irq(lock);
-	if (q->queue_lock != &q->__queue_lock)
-		q->queue_lock = &q->__queue_lock;
-	spin_unlock_irq(lock);
-
 	/* @q is and will stay empty, shutdown and put */
 	blk_put_queue(q);
 }
@@ -524,8 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	mutex_init(&q->blk_trace_mutex);
 #endif
 	mutex_init(&q->sysfs_lock);
-	spin_lock_init(&q->__queue_lock);
-	q->queue_lock = &q->__queue_lock;
+	spin_lock_init(&q->queue_lock);
 
 	init_waitqueue_head(&q->mq_freeze_wq);
 
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index f91ca6b70d6a..5ed59ac6ae58 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -110,9 +110,9 @@ static void ioc_release_fn(struct work_struct *work)
 						struct io_cq, ioc_node);
 		struct request_queue *q = icq->q;
 
-		if (spin_trylock(q->queue_lock)) {
+		if (spin_trylock(&q->queue_lock)) {
 			ioc_destroy_icq(icq);
-			spin_unlock(q->queue_lock);
+			spin_unlock(&q->queue_lock);
 		} else {
 			spin_unlock_irqrestore(&ioc->lock, flags);
 			cpu_relax();
@@ -233,9 +233,9 @@ void ioc_clear_queue(struct request_queue *q)
 {
 	LIST_HEAD(icq_list);
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	list_splice_init(&q->icq_list, &icq_list);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	__ioc_clear_queue(&icq_list);
 }
@@ -326,7 +326,7 @@ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
 {
 	struct io_cq *icq;
 
-	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&q->queue_lock);
 
 	/*
 	 * icq's are indexed from @ioc using radix tree and hint pointer,
@@ -385,7 +385,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	INIT_HLIST_NODE(&icq->ioc_node);
 
 	/* lock both q and ioc and try to link @icq */
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	spin_lock(&ioc->lock);
 
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
@@ -401,7 +401,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	}
 
 	spin_unlock(&ioc->lock);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 	radix_tree_preload_end();
 	return icq;
 }
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 8edf1b353ad1..5f7f1773be61 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -485,11 +485,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 	bio_associate_blkcg(bio, &blkcg->css);
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
 		blkg = blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 	}
 	if (!blkg)
 		goto out;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 66fda19be5a3..d084f731d104 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -37,9 +37,9 @@ void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
 	struct io_context *ioc = rq_ioc(bio);
 	struct io_cq *icq;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	icq = ioc_lookup_icq(ioc, q);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	if (!icq) {
 		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
diff --git a/block/blk-pm.c b/block/blk-pm.c
index f8fdae01bea2..0a028c189897 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -89,12 +89,12 @@ int blk_pre_runtime_suspend(struct request_queue *q)
 	/* Switch q_usage_counter back to per-cpu mode. */
 	blk_mq_unfreeze_queue(q);
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	if (ret < 0)
 		pm_runtime_mark_last_busy(q->dev);
 	else
 		q->rpm_status = RPM_SUSPENDING;
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	if (ret)
 		blk_clear_pm_only(q);
@@ -121,14 +121,14 @@ void blk_post_runtime_suspend(struct request_queue *q, int err)
 	if (!q->dev)
 		return;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	if (!err) {
 		q->rpm_status = RPM_SUSPENDED;
 	} else {
 		q->rpm_status = RPM_ACTIVE;
 		pm_runtime_mark_last_busy(q->dev);
 	}
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	if (err)
 		blk_clear_pm_only(q);
@@ -151,9 +151,9 @@ void blk_pre_runtime_resume(struct request_queue *q)
 	if (!q->dev)
 		return;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	q->rpm_status = RPM_RESUMING;
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }
 EXPORT_SYMBOL(blk_pre_runtime_resume);
 
@@ -176,7 +176,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
 	if (!q->dev)
 		return;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	if (!err) {
 		q->rpm_status = RPM_ACTIVE;
 		pm_runtime_mark_last_busy(q->dev);
@@ -184,7 +184,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err)
 	} else {
 		q->rpm_status = RPM_SUSPENDED;
 	}
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	if (!err)
 		blk_clear_pm_only(q);
@@ -207,10 +207,10 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
  */
 void blk_set_runtime_active(struct request_queue *q)
 {
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	q->rpm_status = RPM_ACTIVE;
 	pm_runtime_mark_last_busy(q->dev);
 	pm_request_autosuspend(q->dev);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }
 EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-pm.h b/block/blk-pm.h
index a8564ea72a41..ea5507d23e75 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -21,7 +21,7 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
 
 static inline void blk_pm_requeue_request(struct request *rq)
 {
-	lockdep_assert_held(rq->q->queue_lock);
+	lockdep_assert_held(&rq->q->queue_lock);
 
 	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
 		rq->q->nr_pending--;
@@ -30,7 +30,7 @@ static inline void blk_pm_requeue_request(struct request *rq)
 static inline void blk_pm_add_request(struct request_queue *q,
 				      struct request *rq)
 {
-	lockdep_assert_held(q->queue_lock);
+	lockdep_assert_held(&q->queue_lock);
 
 	if (q->dev && !(rq->rq_flags & RQF_PM))
 		q->nr_pending++;
@@ -38,7 +38,7 @@ static inline void blk_pm_add_request(struct request_queue *q,
 
 static inline void blk_pm_put_request(struct request *rq)
 {
-	lockdep_assert_held(rq->q->queue_lock);
+	lockdep_assert_held(&rq->q->queue_lock);
 
 	if (rq->q->dev && !(rq->rq_flags & RQF_PM))
 		--rq->q->nr_pending;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 22fd086eba9f..1e370207a20e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -238,10 +238,10 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 	if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
 		return -EINVAL;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	q->limits.max_sectors = max_sectors_kb << 1;
 	q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	return ret;
 }
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a665b0950369..d0a23f0bb3ed 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1243,7 +1243,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
 	bool dispatched;
 	int ret;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	if (throtl_can_upgrade(td, NULL))
 		throtl_upgrade_state(td);
 
@@ -1266,9 +1266,9 @@ again:
 			break;
 
 		/* this dispatch windows is still open, relax and repeat */
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 		cpu_relax();
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
 	}
 
 	if (!dispatched)
@@ -1290,7 +1290,7 @@ again:
 		queue_work(kthrotld_workqueue, &td->dispatch_work);
 	}
 out_unlock:
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 }
 
 /**
@@ -1314,11 +1314,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
 
 	bio_list_init(&bio_list_on_stack);
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	for (rw = READ; rw <= WRITE; rw++)
 		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
 			bio_list_add(&bio_list_on_stack, bio);
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	if (!bio_list_empty(&bio_list_on_stack)) {
 		blk_start_plug(&plug);
@@ -2141,7 +2141,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
 		goto out;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 
 	throtl_update_latency_buckets(td);
 
@@ -2224,7 +2224,7 @@ again:
 	}
 
 out_unlock:
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 out:
 	bio_set_flag(bio, BIO_THROTTLED);
 
@@ -2345,7 +2345,7 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq)
  * Dispatch all currently throttled bios on @q through ->make_request_fn().
  */
 void blk_throtl_drain(struct request_queue *q)
-	__releases(q->queue_lock) __acquires(q->queue_lock)
+	__releases(&q->queue_lock) __acquires(&q->queue_lock)
 {
 	struct throtl_data *td = q->td;
 	struct blkcg_gq *blkg;
@@ -2368,7 +2368,7 @@ void blk_throtl_drain(struct request_queue *q)
 	tg_drain_bios(&td->service_queue);
 
 	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 
 	/* all bios now should be in td->service_queue, issue them */
 	for (rw = READ; rw <= WRITE; rw++)
@@ -2376,7 +2376,7 @@ void blk_throtl_drain(struct request_queue *q)
 						NULL)))
 			generic_make_request(bio);
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 }
 
 int blk_throtl_init(struct request_queue *q)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index a8cfa011c284..eeb4be8d000b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2255,9 +2255,9 @@ static void request_done(int uptodate)
 			DRS->maxtrack = 1;
 
 		/* unlock chained buffers */
-		spin_lock_irqsave(q->queue_lock, flags);
+		spin_lock_irqsave(&q->queue_lock, flags);
 		floppy_end_request(req, 0);
-		spin_unlock_irqrestore(q->queue_lock, flags);
+		spin_unlock_irqrestore(&q->queue_lock, flags);
 	} else {
 		if (rq_data_dir(req) == WRITE) {
 			/* record write error information */
@@ -2269,9 +2269,9 @@ static void request_done(int uptodate)
 			DRWE->last_error_sector = blk_rq_pos(req);
 			DRWE->last_error_generation = DRS->generation;
 		}
-		spin_lock_irqsave(q->queue_lock, flags);
+		spin_lock_irqsave(&q->queue_lock, flags);
 		floppy_end_request(req, BLK_STS_IOERR);
-		spin_unlock_irqrestore(q->queue_lock, flags);
+		spin_unlock_irqrestore(&q->queue_lock, flags);
 	}
 }
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 9381f4e3b221..4adf4c8861cd 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2203,9 +2203,9 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 		 * Some CDRW drives can not handle writes larger than one packet,
 		 * even if the size is a multiple of the packet size.
 		 */
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
 		blk_queue_max_hw_sectors(q, pd->settings.size);
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 		set_bit(PACKET_WRITABLE, &pd->flags);
 	} else {
 		pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index a8c53c98252d..51fe10ac02fa 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -44,15 +44,15 @@ static int ide_pm_execute_rq(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
-	spin_lock_irq(q->queue_lock);
+	spin_lock_irq(&q->queue_lock);
 	if (unlikely(blk_queue_dying(q))) {
 		rq->rq_flags |= RQF_QUIET;
 		scsi_req(rq)->result = -ENXIO;
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 		blk_mq_end_request(rq, BLK_STS_OK);
 		return -ENXIO;
 	}
-	spin_unlock_irq(q->queue_lock);
+	spin_unlock_irq(&q->queue_lock);
 	blk_execute_rq(q, NULL, rq, true);
 
 	return scsi_req(rq)->result ? -EIO : 0;
@@ -214,12 +214,12 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 	printk("%s: completing PM request, %s\n", drive->name,
 	       (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume");
 #endif
-	spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(&q->queue_lock, flags);
 	if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
 		blk_mq_stop_hw_queues(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(&q->queue_lock, flags);
 
 	drive->hwif->rq = NULL;
 
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 2c68efc603bd..a9e2e2037129 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -717,11 +717,11 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
-		spin_lock_irq(q->queue_lock);
+		spin_lock_irq(&q->queue_lock);
 		blkg = blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
-		spin_unlock_irq(q->queue_lock);
+		spin_unlock_irq(&q->queue_lock);
 	}
 
 	throtl = blk_throtl_bio(q, blkg, bio);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c4a3a660e3f0..1d185f1fc333 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -446,13 +446,7 @@ struct request_queue {
 	 */
 	gfp_t			bounce_gfp;
 
-	/*
-	 * protects queue structures from reentrancy. ->__queue_lock should
-	 * _never_ be used directly, it is queue private. always use
-	 * ->queue_lock.
-	 */
-	spinlock_t		__queue_lock;
-	spinlock_t		*queue_lock;
+	spinlock_t		queue_lock;
 
 	/*
 	 * queue kobject

From e815f404afdb70beefe2ffd0d767bf3785985869 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 12:31:27 -0700
Subject: [PATCH 124/351] block: add wbt_disable_default export for BFQ

This isn't unused, if BFQ is modular we get into trouble.

Fixes: b6676f653f13 ("block: remove a few unused exports")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-wbt.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 919444d75489..9f142b84dc85 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -754,6 +754,7 @@ void wbt_disable_default(struct request_queue *q)
 	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
 		rwb->wb_normal = 0;
 }
+EXPORT_SYMBOL_GPL(wbt_disable_default);
 
 static struct rq_qos_ops wbt_rqos_ops = {
 	.throttle = wbt_wait,

From db29eb059cdc571f9d75cec4a41b9884b3b8286a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 16:05:02 -0700
Subject: [PATCH 125/351] nvme: fix handling of EINVAL on
 pci_alloc_irq_vectors_affinity()

At least on SPARC, if MSI/MSI-X isn't supported, we get EINVAL if
we ask for more than one vector. This isn't covered by our ENOSPC
check.

If we get EINVAL, decrease our ask to just one vector, instead of
bailing out in error.

Fixes: 3b6592f70ad7 ("nvme: utilize two queue maps, one for reads and one for writes")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ffbab5b01df4..41730190d932 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2088,15 +2088,11 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 			affd.nr_sets = 1;
 
 		/*
-		 * Need IRQs for read+write queues, and one for the admin queue.
-		 * If we can't get more than one vector, we have to share the
-		 * admin queue and IO queue vector. For that case, don't add
-		 * an extra vector for the admin queue, or we'll continue
-		 * asking for 2 and get -ENOSPC in return.
+		 * If we got a failure and we're down to asking for just
+		 * 1 + 1 queues, just ask for a single vector. We'll share
+		 * that between the single IO queue and the admin queue.
 		 */
-		if (result == -ENOSPC && nr_io_queues == 1)
-			nr_io_queues = 1;
-		else
+		if (!(result < 0 && nr_io_queues == 1))
 			nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
 
 		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
@@ -2104,13 +2100,19 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 
 		/*
-		 * Need to reduce our vec counts
+		 * Need to reduce our vec counts. If we get ENOSPC, the
+		 * platform should support mulitple vecs, we just need
+		 * to decrease our ask. If we get EINVAL, the platform
+		 * likely does not. Back down to ask for just one vector.
 		 */
 		if (result == -ENOSPC) {
 			nr_io_queues--;
 			if (!nr_io_queues)
 				return result;
 			continue;
+		} else if (result == -EINVAL) {
+			nr_io_queues = 1;
+			continue;
 		} else if (result <= 0)
 			return -EIO;
 		break;

From 9334ae5e6f9972110c2be136178ca2591c072b62 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 19:42:07 -0700
Subject: [PATCH 126/351] ide: clear ide_req()->special for non-passthrough
 requests

The initial patch cleared this for all requests, which is wrong
since internal uses can't have this cleared as that's what they
are using to pass data. The fix moved the initialization to the
mq_ops->initialize_rq_fn(), but that's only a partial fix since
it only catches uses from blk_get_request(), not requests coming
from the file system.

Keep the non-fs initialization, and add the IDE entry clear
IFF RQF_DONTPREP isn't set and it's a passthrough request.

Fixes: d16a67667c61 ("ide: don't clear special on ide_queue_rq() entry")
Fixes: 22ce0a7ccf23 ("ide: don't use req->special")
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index c0dd0fad16a3..8445b484ae69 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -463,6 +463,11 @@ blk_status_t ide_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct request	*rq = bd->rq;
 	ide_startstop_t	startstop;
 
+	if (!blk_rq_is_passthrough(rq) && !(rq->rq_flags & RQF_DONTPREP)) {
+		rq->rq_flags |= RQF_DONTPREP;
+		ide_req(rq)->special = NULL;
+	}
+
 	/* HLD do_request() callback might sleep, make sure it's okay */
 	might_sleep();
 

From dabcefab45d36ecb5a22f16577bb0f298876a22d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 14 Nov 2018 09:38:28 -0700
Subject: [PATCH 127/351] nvme: provide optimized poll function for separate
 poll queues

If we have separate poll queues, we know that they aren't using
interrupts. Hence we don't need to disable interrupts around
finding completions.

Provide a separate set of blk_mq_ops for such devices.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 45 +++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 41730190d932..89874e23e422 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1082,6 +1082,23 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 	return __nvme_poll(nvmeq, tag);
 }
 
+static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+	u16 start, end;
+	bool found;
+
+	if (!nvme_cqe_pending(nvmeq))
+		return 0;
+
+	spin_lock(&nvmeq->cq_lock);
+	found = nvme_process_cq(nvmeq, &start, &end, tag);
+	spin_unlock(&nvmeq->cq_lock);
+
+	nvme_complete_cqes(nvmeq, start, end);
+	return found;
+}
+
 static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 {
 	struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -1584,17 +1601,25 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.timeout	= nvme_timeout,
 };
 
+#define NVME_SHARED_MQ_OPS					\
+	.queue_rq		= nvme_queue_rq,		\
+	.rq_flags_to_type	= nvme_rq_flags_to_type,	\
+	.complete		= nvme_pci_complete_rq,		\
+	.init_hctx		= nvme_init_hctx,		\
+	.init_request		= nvme_init_request,		\
+	.map_queues		= nvme_pci_map_queues,		\
+	.timeout		= nvme_timeout			\
+
 static const struct blk_mq_ops nvme_mq_ops = {
-	.queue_rq		= nvme_queue_rq,
-	.rq_flags_to_type	= nvme_rq_flags_to_type,
-	.complete		= nvme_pci_complete_rq,
-	.init_hctx		= nvme_init_hctx,
-	.init_request		= nvme_init_request,
-	.map_queues		= nvme_pci_map_queues,
-	.timeout		= nvme_timeout,
+	NVME_SHARED_MQ_OPS,
 	.poll			= nvme_poll,
 };
 
+static const struct blk_mq_ops nvme_mq_poll_noirq_ops = {
+	NVME_SHARED_MQ_OPS,
+	.poll			= nvme_poll_noirq,
+};
+
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
 {
 	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
@@ -2276,7 +2301,11 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	int ret;
 
 	if (!dev->ctrl.tagset) {
-		dev->tagset.ops = &nvme_mq_ops;
+		if (!dev->io_queues[NVMEQ_TYPE_POLL])
+			dev->tagset.ops = &nvme_mq_ops;
+		else
+			dev->tagset.ops = &nvme_mq_poll_noirq_ops;
+
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
 		dev->tagset.nr_maps = NVMEQ_TYPE_NR;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;

From 344e9ffcbd1898e1dc04085564a6e05c30ea8199 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 12:22:51 -0700
Subject: [PATCH 128/351] block: add queue_is_mq() helper

Various spots check for q->mq_ops being non-NULL, but provide
a helper to do this instead.

Where the ->mq_ops != NULL check is redundant, remove it.

Since mq == rq-based now that legacy is gone, get rid of the
queue_is_rq_based() and just use queue_is_mq() everywhere.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c     |  8 ++++----
 block/blk-core.c       | 12 ++++++------
 block/blk-flush.c      |  3 +--
 block/blk-mq.c         |  2 +-
 block/blk-sysfs.c      | 14 +++++++-------
 block/blk-throttle.c   |  2 +-
 block/blk-wbt.c        |  2 +-
 block/blk-zoned.c      |  2 +-
 block/bsg.c            |  2 +-
 block/elevator.c       | 11 +++++------
 block/genhd.c          |  8 ++++----
 drivers/md/dm-rq.c     |  2 +-
 drivers/md/dm-table.c  |  4 ++--
 include/linux/blkdev.h |  6 +-----
 14 files changed, 36 insertions(+), 42 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0f6b44614165..63d226a084cd 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1324,7 +1324,7 @@ int blkcg_activate_policy(struct request_queue *q,
 	if (blkcg_policy_enabled(q, pol))
 		return 0;
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_freeze_queue(q);
 pd_prealloc:
 	if (!pd_prealloc) {
@@ -1363,7 +1363,7 @@ pd_prealloc:
 
 	spin_unlock_irq(&q->queue_lock);
 out_bypass_end:
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q);
 	if (pd_prealloc)
 		pol->pd_free_fn(pd_prealloc);
@@ -1387,7 +1387,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 	if (!blkcg_policy_enabled(q, pol))
 		return;
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_freeze_queue(q);
 
 	spin_lock_irq(&q->queue_lock);
@@ -1405,7 +1405,7 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
 	spin_unlock_irq(&q->queue_lock);
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q);
 }
 EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
diff --git a/block/blk-core.c b/block/blk-core.c
index 92b6b200e9fb..0b684a520a11 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -232,7 +232,7 @@ void blk_sync_queue(struct request_queue *q)
 	del_timer_sync(&q->timeout);
 	cancel_work_sync(&q->timeout_work);
 
-	if (q->mq_ops) {
+	if (queue_is_mq(q)) {
 		struct blk_mq_hw_ctx *hctx;
 		int i;
 
@@ -281,7 +281,7 @@ void blk_set_queue_dying(struct request_queue *q)
 	 */
 	blk_freeze_queue_start(q);
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_wake_waiters(q);
 
 	/* Make blk_queue_enter() reexamine the DYING flag. */
@@ -356,7 +356,7 @@ void blk_cleanup_queue(struct request_queue *q)
 	 * blk_freeze_queue() should be enough for cases of passthrough
 	 * request.
 	 */
-	if (q->mq_ops && blk_queue_init_done(q))
+	if (queue_is_mq(q) && blk_queue_init_done(q))
 		blk_mq_quiesce_queue(q);
 
 	/* for synchronous bio-based driver finish in-flight integrity i/o */
@@ -374,7 +374,7 @@ void blk_cleanup_queue(struct request_queue *q)
 
 	blk_exit_queue(q);
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_free_queue(q);
 
 	percpu_ref_exit(&q->q_usage_counter);
@@ -982,7 +982,7 @@ generic_make_request_checks(struct bio *bio)
 	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
 	 * if queue is not a request based queue.
 	 */
-	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
+	if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
 		goto not_supported;
 
 	if (should_fail_bio(bio))
@@ -1657,7 +1657,7 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
  */
 int blk_lld_busy(struct request_queue *q)
 {
-	if (q->mq_ops && q->mq_ops->busy)
+	if (queue_is_mq(q) && q->mq_ops->busy)
 		return q->mq_ops->busy(q);
 
 	return 0;
diff --git a/block/blk-flush.c b/block/blk-flush.c
index fcd18b158fd6..a3fc7191c694 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -273,8 +273,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
 	 * assigned to empty flushes, and we deadlock if we are expecting
 	 * other requests to make progress. Don't defer for that case.
 	 */
-	if (!list_empty(&fq->flush_data_in_flight) &&
-	    !(q->mq_ops && q->elevator) &&
+	if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
 	    time_before(jiffies,
 			fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
 		return;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b823891b3ef..32b246ed44c0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -150,7 +150,7 @@ void blk_freeze_queue_start(struct request_queue *q)
 	freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
 	if (freeze_depth == 1) {
 		percpu_ref_kill(&q->q_usage_counter);
-		if (q->mq_ops)
+		if (queue_is_mq(q))
 			blk_mq_run_hw_queues(q, false);
 	}
 }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 1e370207a20e..80eef48fddc8 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
 	unsigned long nr;
 	int ret, err;
 
-	if (!q->mq_ops)
+	if (!queue_is_mq(q))
 		return -EINVAL;
 
 	ret = queue_var_store(&nr, page, count);
@@ -835,12 +835,12 @@ static void __blk_release_queue(struct work_struct *work)
 
 	blk_queue_free_zone_bitmaps(q);
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_release(q);
 
 	blk_trace_shutdown(q);
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_debugfs_unregister(q);
 
 	bioset_exit(&q->bio_split);
@@ -914,7 +914,7 @@ int blk_register_queue(struct gendisk *disk)
 		goto unlock;
 	}
 
-	if (q->mq_ops) {
+	if (queue_is_mq(q)) {
 		__blk_mq_register_dev(dev, q);
 		blk_mq_debugfs_register(q);
 	}
@@ -925,7 +925,7 @@ int blk_register_queue(struct gendisk *disk)
 
 	blk_throtl_register_queue(q);
 
-	if ((q->mq_ops && q->elevator)) {
+	if (q->elevator) {
 		ret = elv_register_queue(q);
 		if (ret) {
 			mutex_unlock(&q->sysfs_lock);
@@ -974,7 +974,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	 * Remove the sysfs attributes before unregistering the queue data
 	 * structures that can be modified through sysfs.
 	 */
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		blk_mq_unregister_dev(disk_to_dev(disk), q);
 	mutex_unlock(&q->sysfs_lock);
 
@@ -983,7 +983,7 @@ void blk_unregister_queue(struct gendisk *disk)
 	blk_trace_remove_sysfs(disk_to_dev(disk));
 
 	mutex_lock(&q->sysfs_lock);
-	if (q->mq_ops && q->elevator)
+	if (q->elevator)
 		elv_unregister_queue(q);
 	mutex_unlock(&q->sysfs_lock);
 
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d0a23f0bb3ed..8f0a104770ee 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2456,7 +2456,7 @@ void blk_throtl_register_queue(struct request_queue *q)
 	td->throtl_slice = DFL_THROTL_SLICE_HD;
 #endif
 
-	td->track_bio_latency = !queue_is_rq_based(q);
+	td->track_bio_latency = !queue_is_mq(q);
 	if (!td->track_bio_latency)
 		blk_stat_enable_accounting(q);
 }
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 9f142b84dc85..d051ebfb4852 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -701,7 +701,7 @@ void wbt_enable_default(struct request_queue *q)
 	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
 		return;
 
-	if (q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ))
+	if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
 		wbt_init(q);
 }
 EXPORT_SYMBOL_GPL(wbt_enable_default);
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 13ba2011a306..e9c332b1d9da 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	 * BIO based queues do not use a scheduler so only q->nr_zones
 	 * needs to be updated so that the sysfs exposed value is correct.
 	 */
-	if (!queue_is_rq_based(q)) {
+	if (!queue_is_mq(q)) {
 		q->nr_zones = nr_zones;
 		return 0;
 	}
diff --git a/block/bsg.c b/block/bsg.c
index 9a442c23a715..44f6028b9567 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent,
 	/*
 	 * we need a proper transport to send commands, not a stacked device
 	 */
-	if (!queue_is_rq_based(q))
+	if (!queue_is_mq(q))
 		return 0;
 
 	bcd = &q->bsg_dev;
diff --git a/block/elevator.c b/block/elevator.c
index 796436270682..f05e90d4e695 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -667,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
 	/*
 	 * Special case for mq, turn off scheduling
 	 */
-	if (q->mq_ops && !strncmp(name, "none", 4))
+	if (!strncmp(name, "none", 4))
 		return elevator_switch(q, NULL);
 
 	strlcpy(elevator_name, name, sizeof(elevator_name));
@@ -685,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
 
 static inline bool elv_support_iosched(struct request_queue *q)
 {
-	if (q->mq_ops && q->tag_set && (q->tag_set->flags &
-				BLK_MQ_F_NO_SCHED))
+	if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
 		return false;
 	return true;
 }
@@ -696,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
 	int ret;
 
-	if (!q->mq_ops || !elv_support_iosched(q))
+	if (!queue_is_mq(q) || !elv_support_iosched(q))
 		return count;
 
 	ret = __elevator_change(q, name);
@@ -713,7 +712,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 	struct elevator_type *__e;
 	int len = 0;
 
-	if (!queue_is_rq_based(q))
+	if (!queue_is_mq(q))
 		return sprintf(name, "none\n");
 
 	if (!q->elevator)
@@ -732,7 +731,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
 	}
 	spin_unlock(&elv_list_lock);
 
-	if (q->mq_ops && q->elevator)
+	if (q->elevator)
 		len += sprintf(name+len, "none");
 
 	len += sprintf(len+name, "\n");
diff --git a/block/genhd.c b/block/genhd.c
index cff6bdf27226..0145bcb0cc76 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -47,7 +47,7 @@ static void disk_release_events(struct gendisk *disk);
 
 void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 {
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		return;
 
 	atomic_inc(&part->in_flight[rw]);
@@ -57,7 +57,7 @@ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 
 void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 {
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		return;
 
 	atomic_dec(&part->in_flight[rw]);
@@ -68,7 +68,7 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
 		    unsigned int inflight[2])
 {
-	if (q->mq_ops) {
+	if (queue_is_mq(q)) {
 		blk_mq_in_flight(q, part, inflight);
 		return;
 	}
@@ -85,7 +85,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 		       unsigned int inflight[2])
 {
-	if (q->mq_ops) {
+	if (queue_is_mq(q)) {
 		blk_mq_in_flight_rw(q, part, inflight);
 		return;
 	}
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 7cd36e4d1310..1f1fe9a618ea 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -43,7 +43,7 @@ static unsigned dm_get_blk_mq_queue_depth(void)
 
 int dm_request_based(struct mapped_device *md)
 {
-	return queue_is_rq_based(md->queue);
+	return queue_is_mq(md->queue);
 }
 
 void dm_start_queue(struct request_queue *q)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 9038c302d5c2..844f7d0f2ef8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -919,12 +919,12 @@ static int device_is_rq_based(struct dm_target *ti, struct dm_dev *dev,
 	struct request_queue *q = bdev_get_queue(dev->bdev);
 	struct verify_rq_based_data *v = data;
 
-	if (q->mq_ops)
+	if (queue_is_mq(q))
 		v->mq_count++;
 	else
 		v->sq_count++;
 
-	return queue_is_rq_based(q);
+	return queue_is_mq(q);
 }
 
 static int dm_table_determine_type(struct dm_table *t)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1d185f1fc333..41aaa05e42c1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -656,11 +656,7 @@ static inline bool blk_account_rq(struct request *rq)
 
 #define rq_data_dir(rq)		(op_is_write(req_op(rq)) ? WRITE : READ)
 
-/*
- * Driver can handle struct request, if it either has an old style
- * request_fn defined, or is blk-mq based.
- */
-static inline bool queue_is_rq_based(struct request_queue *q)
+static inline bool queue_is_mq(struct request_queue *q)
 {
 	return q->mq_ops;
 }

From e504545446e4be916b170f159b6495c352a2b5a5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 12:25:10 -0700
Subject: [PATCH 129/351] blk-rq-qos: inline check for q->rq_qos functions

Put the short code in the fast path, where we don't have any
functions attached to the queue. This minimizes the impact on
the hot path in the core code.

Cc: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-rq-qos.c | 63 +++++++++++++++++++++-------------------------
 block/blk-rq-qos.h | 59 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 80 insertions(+), 42 deletions(-)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index f8a4d3fbb98c..80f603b76f61 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -27,74 +27,67 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit)
 	return atomic_inc_below(&rq_wait->inflight, limit);
 }
 
-void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
+void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio)
 {
-	struct rq_qos *rqos;
-
-	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->cleanup)
 			rqos->ops->cleanup(rqos, bio);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
-void rq_qos_done(struct request_queue *q, struct request *rq)
+void __rq_qos_done(struct rq_qos *rqos, struct request *rq)
 {
-	struct rq_qos *rqos;
-
-	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->done)
 			rqos->ops->done(rqos, rq);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
-void rq_qos_issue(struct request_queue *q, struct request *rq)
+void __rq_qos_issue(struct rq_qos *rqos, struct request *rq)
 {
-	struct rq_qos *rqos;
-
-	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->issue)
 			rqos->ops->issue(rqos, rq);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
-void rq_qos_requeue(struct request_queue *q, struct request *rq)
+void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq)
 {
-	struct rq_qos *rqos;
-
-	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->requeue)
 			rqos->ops->requeue(rqos, rq);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
-void rq_qos_throttle(struct request_queue *q, struct bio *bio)
+void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio)
 {
-	struct rq_qos *rqos;
-
-	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->throttle)
 			rqos->ops->throttle(rqos, bio);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
-void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio)
+void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
 {
-	struct rq_qos *rqos;
-
-	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->track)
 			rqos->ops->track(rqos, rq, bio);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
-void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
 {
-	struct rq_qos *rqos;
-
-	for(rqos = q->rq_qos; rqos; rqos = rqos->next) {
+	do {
 		if (rqos->ops->done_bio)
 			rqos->ops->done_bio(rqos, bio);
-	}
+		rqos = rqos->next;
+	} while (rqos);
 }
 
 /*
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index b6b11d496007..6e09e98b93ea 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -98,12 +98,57 @@ void rq_depth_scale_up(struct rq_depth *rqd);
 void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
 bool rq_depth_calc_max_depth(struct rq_depth *rqd);
 
-void rq_qos_cleanup(struct request_queue *, struct bio *);
-void rq_qos_done(struct request_queue *, struct request *);
-void rq_qos_issue(struct request_queue *, struct request *);
-void rq_qos_requeue(struct request_queue *, struct request *);
-void rq_qos_done_bio(struct request_queue *q, struct bio *bio);
-void rq_qos_throttle(struct request_queue *, struct bio *);
-void rq_qos_track(struct request_queue *q, struct request *, struct bio *);
+void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
+void __rq_qos_done(struct rq_qos *rqos, struct request *rq);
+void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
+void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
+void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
+void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
+void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
+
+static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
+{
+	if (q->rq_qos)
+		__rq_qos_cleanup(q->rq_qos, bio);
+}
+
+static inline void rq_qos_done(struct request_queue *q, struct request *rq)
+{
+	if (q->rq_qos)
+		__rq_qos_done(q->rq_qos, rq);
+}
+
+static inline void rq_qos_issue(struct request_queue *q, struct request *rq)
+{
+	if (q->rq_qos)
+		__rq_qos_issue(q->rq_qos, rq);
+}
+
+static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
+{
+	if (q->rq_qos)
+		__rq_qos_requeue(q->rq_qos, rq);
+}
+
+static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+{
+	if (q->rq_qos)
+		__rq_qos_done_bio(q->rq_qos, bio);
+}
+
+static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
+{
+	if (q->rq_qos)
+		__rq_qos_throttle(q->rq_qos, bio);
+}
+
+static inline void rq_qos_track(struct request_queue *q, struct request *rq,
+				struct bio *bio)
+{
+	if (q->rq_qos)
+		__rq_qos_track(q->rq_qos, rq, bio);
+}
+
 void rq_qos_exit(struct request_queue *);
+
 #endif

From 0619317ff8baa2da9238191ad5167ed3618c16d9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 13 Nov 2018 21:16:54 -0700
Subject: [PATCH 130/351] block: add polled wakeup task helper

If we're polling for IO on a device that doesn't use interrupts, then
IO completion loop (and wake of task) is done by submitting task itself.
If that is the case, then we don't need to enter the wake_up_process()
function, we can simply mark ourselves as TASK_RUNNING.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c         |  4 ++--
 fs/iomap.c             |  2 +-
 include/linux/blkdev.h | 13 +++++++++++++
 mm/page_io.c           |  2 +-
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index c039abfb2052..9fe56672cfe5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -181,7 +181,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
 	struct task_struct *waiter = bio->bi_private;
 
 	WRITE_ONCE(bio->bi_private, NULL);
-	wake_up_process(waiter);
+	blk_wake_io_task(waiter);
 }
 
 static ssize_t
@@ -305,7 +305,7 @@ static void blkdev_bio_end_io(struct bio *bio)
 			struct task_struct *waiter = dio->waiter;
 
 			WRITE_ONCE(dio->waiter, NULL);
-			wake_up_process(waiter);
+			blk_wake_io_task(waiter);
 		}
 	}
 
diff --git a/fs/iomap.c b/fs/iomap.c
index f61d13dfdf09..b0462b363bad 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1525,7 +1525,7 @@ static void iomap_dio_bio_end_io(struct bio *bio)
 		if (dio->wait_for_completion) {
 			struct task_struct *waiter = dio->submit.waiter;
 			WRITE_ONCE(dio->submit.waiter, NULL);
-			wake_up_process(waiter);
+			blk_wake_io_task(waiter);
 		} else if (dio->flags & IOMAP_DIO_WRITE) {
 			struct inode *inode = file_inode(dio->iocb->ki_filp);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 41aaa05e42c1..91c44f7a7f62 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1772,4 +1772,17 @@ static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
 
 #endif /* CONFIG_BLOCK */
 
+static inline void blk_wake_io_task(struct task_struct *waiter)
+{
+	/*
+	 * If we're polling, the task itself is doing the completions. For
+	 * that case, we don't need to signal a wakeup, it's enough to just
+	 * mark us as RUNNING.
+	 */
+	if (waiter == current)
+		__set_current_state(TASK_RUNNING);
+	else
+		wake_up_process(waiter);
+}
+
 #endif
diff --git a/mm/page_io.c b/mm/page_io.c
index d4d1c89bcddd..57572ff46016 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -140,7 +140,7 @@ out:
 	unlock_page(page);
 	WRITE_ONCE(bio->bi_private, NULL);
 	bio_put(bio);
-	wake_up_process(waiter);
+	blk_wake_io_task(waiter);
 	put_task_struct(waiter);
 }
 

From d34513d384487e8022f143a3a6b791e6d7f0dad6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 6 Nov 2018 14:29:11 -0700
Subject: [PATCH 131/351] block: for async O_DIRECT, mark us as polling if
 asked to

Inherit the iocb IOCB_HIPRI flag, and pass on REQ_HIPRI for
those kinds of requests.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9fe56672cfe5..e72b119ede84 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -383,6 +383,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 
 		nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
 		if (!nr_pages) {
+			if (iocb->ki_flags & IOCB_HIPRI)
+				bio->bi_opf |= REQ_HIPRI;
+
 			qc = submit_bio(bio);
 			break;
 		}

From cb700eb3faa488fbb4b60689adec84032d7cf24a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 15 Nov 2018 19:56:53 -0700
Subject: [PATCH 132/351] block: don't plug for aio/O_DIRECT HIPRI IO

Those will go straight to issue inside blk-mq, so don't bother
setting up a block plug for them.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index e72b119ede84..4d79bc80fb41 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -330,6 +330,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
+	bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
 	bool is_read = (iov_iter_rw(iter) == READ), is_sync;
 	loff_t pos = iocb->ki_pos;
 	blk_qc_t qc = BLK_QC_T_NONE;
@@ -353,7 +354,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 	dio->multi_bio = false;
 	dio->should_dirty = is_read && iter_is_iovec(iter);
 
-	blk_start_plug(&plug);
+	/*
+	 * Don't plug for HIPRI/polled IO, as those should go straight
+	 * to issue
+	 */
+	if (!is_poll)
+		blk_start_plug(&plug);
+
 	for (;;) {
 		bio_set_dev(bio, bdev);
 		bio->bi_iter.bi_sector = pos >> 9;
@@ -400,7 +407,9 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		submit_bio(bio);
 		bio = bio_alloc(GFP_KERNEL, nr_pages);
 	}
-	blk_finish_plug(&plug);
+
+	if (!is_poll)
+		blk_finish_plug(&plug);
 
 	if (!is_sync)
 		return -EIOCBQUEUED;

From 2b78eae147a13ab2ca7caa121dd3fca2eecf8613 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:01 +0100
Subject: [PATCH 133/351] block: remove the rq_alloc_data request_queue field

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 91c44f7a7f62..1ad6eafc43f2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -567,7 +567,6 @@ struct request_queue {
 	bool			mq_sysfs_init_done;
 
 	size_t			cmd_size;
-	void			*rq_alloc_data;
 
 	struct work_struct	release_work;
 

From 503f620f0cb8a9da9c2bba72f2141aaa4b0e3962 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:02 +0100
Subject: [PATCH 134/351] floppy: remove queue_lock around floppy_end_request

There is nothing the queue_lock could protect inside floppy_end_request,
so remove it.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/floppy.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index eeb4be8d000b..218099dd8e44 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2254,10 +2254,7 @@ static void request_done(int uptodate)
 		if (block > _floppy->sect)
 			DRS->maxtrack = 1;
 
-		/* unlock chained buffers */
-		spin_lock_irqsave(&q->queue_lock, flags);
 		floppy_end_request(req, 0);
-		spin_unlock_irqrestore(&q->queue_lock, flags);
 	} else {
 		if (rq_data_dir(req) == WRITE) {
 			/* record write error information */
@@ -2269,9 +2266,7 @@ static void request_done(int uptodate)
 			DRWE->last_error_sector = blk_rq_pos(req);
 			DRWE->last_error_generation = DRS->generation;
 		}
-		spin_lock_irqsave(&q->queue_lock, flags);
 		floppy_end_request(req, BLK_STS_IOERR);
-		spin_unlock_irqrestore(&q->queue_lock, flags);
 	}
 }
 

From a50f9aec1ac713dd25f1b1b443d97aa2fc69f740 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:03 +0100
Subject: [PATCH 135/351] pktcdvd: remove queue_lock around
 blk_queue_max_hw_sectors

blk_queue_max_hw_sectors can't do anything with queue_lock protection
so don't hold it.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/pktcdvd.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 4adf4c8861cd..f5a71023f76c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2203,9 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
 		 * Some CDRW drives can not handle writes larger than one packet,
 		 * even if the size is a multiple of the packet size.
 		 */
-		spin_lock_irq(&q->queue_lock);
 		blk_queue_max_hw_sectors(q, pd->settings.size);
-		spin_unlock_irq(&q->queue_lock);
 		set_bit(PACKET_WRITABLE, &pd->flags);
 	} else {
 		pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);

From b2101f655f8f6bd510154364e24d1cdba037133f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:04 +0100
Subject: [PATCH 136/351] ide: don't acquire queue lock in ide_pm_execute_rq

There is nothing we can synchronize against over a call to
blk_queue_dying.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-pm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 51fe10ac02fa..56690f523100 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -44,15 +44,12 @@ static int ide_pm_execute_rq(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
-	spin_lock_irq(&q->queue_lock);
 	if (unlikely(blk_queue_dying(q))) {
 		rq->rq_flags |= RQF_QUIET;
 		scsi_req(rq)->result = -ENXIO;
-		spin_unlock_irq(&q->queue_lock);
 		blk_mq_end_request(rq, BLK_STS_OK);
 		return -ENXIO;
 	}
-	spin_unlock_irq(&q->queue_lock);
 	blk_execute_rq(q, NULL, rq, true);
 
 	return scsi_req(rq)->result ? -EIO : 0;

From f04842734c7a68381ab3cb45628ffb3ff1029490 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:05 +0100
Subject: [PATCH 137/351] ide: don't acquire queue_lock in ide_complete_pm_rq

blk_mq_stop_hw_queues doesn't need any locking, and the ide
dev_flags field isn't protected by it either.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-pm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 56690f523100..192e6c65d34e 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -201,7 +201,6 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
 	struct ide_pm_state *pm = ide_req(rq)->special;
-	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
 	if (pm->pm_step != IDE_PM_COMPLETED)
@@ -211,12 +210,10 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 	printk("%s: completing PM request, %s\n", drive->name,
 	       (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND) ? "suspend" : "resume");
 #endif
-	spin_lock_irqsave(&q->queue_lock, flags);
 	if (ide_req(rq)->type == ATA_PRIV_PM_SUSPEND)
 		blk_mq_stop_hw_queues(q);
 	else
 		drive->dev_flags &= ~IDE_DFLAG_BLOCKED;
-	spin_unlock_irqrestore(&q->queue_lock, flags);
 
 	drive->hwif->rq = NULL;
 

From f5d72c5c55bc392523cbdcdedd575c280203d31c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2018 09:10:06 +0100
Subject: [PATCH 138/351] mmc: stop abusing the request queue_lock pointer

Replace the lock in mmc_blk_data that is only used through a pointer
in struct mmc_queue and to protect fields in that structure with
an actual lock in struct mmc_queue.

Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/mmc/core/block.c | 24 +++++++++++-------------
 drivers/mmc/core/queue.c | 31 +++++++++++++++----------------
 drivers/mmc/core/queue.h |  4 ++--
 3 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index 70ec465beb69..2c329a3e3fdb 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -100,7 +100,6 @@ static DEFINE_IDA(mmc_rpmb_ida);
  * There is one mmc_blk_data per slot.
  */
 struct mmc_blk_data {
-	spinlock_t	lock;
 	struct device	*parent;
 	struct gendisk	*disk;
 	struct mmc_queue queue;
@@ -1483,7 +1482,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
 		blk_mq_end_request(req, BLK_STS_OK);
 	}
 
-	spin_lock_irqsave(mq->lock, flags);
+	spin_lock_irqsave(&mq->lock, flags);
 
 	mq->in_flight[mmc_issue_type(mq, req)] -= 1;
 
@@ -1491,7 +1490,7 @@ static void mmc_blk_cqe_complete_rq(struct mmc_queue *mq, struct request *req)
 
 	mmc_cqe_check_busy(mq);
 
-	spin_unlock_irqrestore(mq->lock, flags);
+	spin_unlock_irqrestore(&mq->lock, flags);
 
 	if (!mq->cqe_busy)
 		blk_mq_run_hw_queues(q, true);
@@ -1991,13 +1990,13 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req)
 	unsigned long flags;
 	bool put_card;
 
-	spin_lock_irqsave(mq->lock, flags);
+	spin_lock_irqsave(&mq->lock, flags);
 
 	mq->in_flight[mmc_issue_type(mq, req)] -= 1;
 
 	put_card = (mmc_tot_in_flight(mq) == 0);
 
-	spin_unlock_irqrestore(mq->lock, flags);
+	spin_unlock_irqrestore(&mq->lock, flags);
 
 	if (put_card)
 		mmc_put_card(mq->card, &mq->ctx);
@@ -2093,11 +2092,11 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
 		 * request does not need to wait (although it does need to
 		 * complete complete_req first).
 		 */
-		spin_lock_irqsave(mq->lock, flags);
+		spin_lock_irqsave(&mq->lock, flags);
 		mq->complete_req = req;
 		mq->rw_wait = false;
 		waiting = mq->waiting;
-		spin_unlock_irqrestore(mq->lock, flags);
+		spin_unlock_irqrestore(&mq->lock, flags);
 
 		/*
 		 * If 'waiting' then the waiting task will complete this
@@ -2116,10 +2115,10 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq)
 	/* Take the recovery path for errors or urgent background operations */
 	if (mmc_blk_rq_error(&mqrq->brq) ||
 	    mmc_blk_urgent_bkops_needed(mq, mqrq)) {
-		spin_lock_irqsave(mq->lock, flags);
+		spin_lock_irqsave(&mq->lock, flags);
 		mq->recovery_needed = true;
 		mq->recovery_req = req;
-		spin_unlock_irqrestore(mq->lock, flags);
+		spin_unlock_irqrestore(&mq->lock, flags);
 		wake_up(&mq->wait);
 		schedule_work(&mq->recovery_work);
 		return;
@@ -2142,7 +2141,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
 	 * Wait while there is another request in progress, but not if recovery
 	 * is needed. Also indicate whether there is a request waiting to start.
 	 */
-	spin_lock_irqsave(mq->lock, flags);
+	spin_lock_irqsave(&mq->lock, flags);
 	if (mq->recovery_needed) {
 		*err = -EBUSY;
 		done = true;
@@ -2150,7 +2149,7 @@ static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
 		done = !mq->rw_wait;
 	}
 	mq->waiting = !done;
-	spin_unlock_irqrestore(mq->lock, flags);
+	spin_unlock_irqrestore(&mq->lock, flags);
 
 	return done;
 }
@@ -2327,12 +2326,11 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 		goto err_kfree;
 	}
 
-	spin_lock_init(&md->lock);
 	INIT_LIST_HEAD(&md->part);
 	INIT_LIST_HEAD(&md->rpmbs);
 	md->usage = 1;
 
-	ret = mmc_init_queue(&md->queue, card, &md->lock);
+	ret = mmc_init_queue(&md->queue, card);
 	if (ret)
 		goto err_putdisk;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 4485cf12218c..35cc138b096d 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -89,9 +89,9 @@ void mmc_cqe_recovery_notifier(struct mmc_request *mrq)
 	struct mmc_queue *mq = q->queuedata;
 	unsigned long flags;
 
-	spin_lock_irqsave(mq->lock, flags);
+	spin_lock_irqsave(&mq->lock, flags);
 	__mmc_cqe_recovery_notifier(mq);
-	spin_unlock_irqrestore(mq->lock, flags);
+	spin_unlock_irqrestore(&mq->lock, flags);
 }
 
 static enum blk_eh_timer_return mmc_cqe_timed_out(struct request *req)
@@ -128,14 +128,14 @@ static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
 	unsigned long flags;
 	int ret;
 
-	spin_lock_irqsave(mq->lock, flags);
+	spin_lock_irqsave(&mq->lock, flags);
 
 	if (mq->recovery_needed || !mq->use_cqe)
 		ret = BLK_EH_RESET_TIMER;
 	else
 		ret = mmc_cqe_timed_out(req);
 
-	spin_unlock_irqrestore(mq->lock, flags);
+	spin_unlock_irqrestore(&mq->lock, flags);
 
 	return ret;
 }
@@ -157,9 +157,9 @@ static void mmc_mq_recovery_handler(struct work_struct *work)
 
 	mq->in_recovery = false;
 
-	spin_lock_irq(mq->lock);
+	spin_lock_irq(&mq->lock);
 	mq->recovery_needed = false;
-	spin_unlock_irq(mq->lock);
+	spin_unlock_irq(&mq->lock);
 
 	mmc_put_card(mq->card, &mq->ctx);
 
@@ -258,10 +258,10 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	issue_type = mmc_issue_type(mq, req);
 
-	spin_lock_irq(mq->lock);
+	spin_lock_irq(&mq->lock);
 
 	if (mq->recovery_needed || mq->busy) {
-		spin_unlock_irq(mq->lock);
+		spin_unlock_irq(&mq->lock);
 		return BLK_STS_RESOURCE;
 	}
 
@@ -269,7 +269,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	case MMC_ISSUE_DCMD:
 		if (mmc_cqe_dcmd_busy(mq)) {
 			mq->cqe_busy |= MMC_CQE_DCMD_BUSY;
-			spin_unlock_irq(mq->lock);
+			spin_unlock_irq(&mq->lock);
 			return BLK_STS_RESOURCE;
 		}
 		break;
@@ -294,7 +294,7 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	get_card = (mmc_tot_in_flight(mq) == 1);
 	cqe_retune_ok = (mmc_cqe_qcnt(mq) == 1);
 
-	spin_unlock_irq(mq->lock);
+	spin_unlock_irq(&mq->lock);
 
 	if (!(req->rq_flags & RQF_DONTPREP)) {
 		req_to_mmc_queue_req(req)->retries = 0;
@@ -328,12 +328,12 @@ static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (issued != MMC_REQ_STARTED) {
 		bool put_card = false;
 
-		spin_lock_irq(mq->lock);
+		spin_lock_irq(&mq->lock);
 		mq->in_flight[issue_type] -= 1;
 		if (mmc_tot_in_flight(mq) == 0)
 			put_card = true;
 		mq->busy = false;
-		spin_unlock_irq(mq->lock);
+		spin_unlock_irq(&mq->lock);
 		if (put_card)
 			mmc_put_card(card, &mq->ctx);
 	} else {
@@ -385,19 +385,18 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
  * mmc_init_queue - initialise a queue structure.
  * @mq: mmc queue
  * @card: mmc card to attach this queue
- * @lock: queue lock
  *
  * Initialise a MMC card request queue.
  */
-int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
-		   spinlock_t *lock)
+int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
 {
 	struct mmc_host *host = card->host;
 	int ret;
 
 	mq->card = card;
-	mq->lock = lock;
 	mq->use_cqe = host->cqe_enabled;
+	
+	spin_lock_init(&mq->lock);
 
 	memset(&mq->tag_set, 0, sizeof(mq->tag_set));
 	mq->tag_set.ops = &mmc_mq_ops;
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 5421f1542e71..fd11491ced9f 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -73,11 +73,11 @@ struct mmc_queue_req {
 
 struct mmc_queue {
 	struct mmc_card		*card;
-	spinlock_t		*lock;
 	struct mmc_ctx		ctx;
 	struct blk_mq_tag_set	tag_set;
 	struct mmc_blk_data	*blkdata;
 	struct request_queue	*queue;
+	spinlock_t		lock;
 	int			in_flight[MMC_ISSUE_MAX];
 	unsigned int		cqe_busy;
 #define MMC_CQE_DCMD_BUSY	BIT(0)
@@ -96,7 +96,7 @@ struct mmc_queue {
 	struct work_struct	complete_work;
 };
 
-extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *);
+extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *);
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);

From fce15a609f8f30cfacfaf684729add9582be780b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 18 Nov 2018 15:42:48 -0700
Subject: [PATCH 139/351] floppy: remove now unused 'flags' variable

With the locking removed, it's unused. Kill it.

Fixes: 503f620f0cb8 ("floppy: remove queue_lock around floppy_end_request")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/floppy.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 218099dd8e44..cad339ec9c19 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2231,7 +2231,6 @@ static void request_done(int uptodate)
 {
 	struct request *req = current_req;
 	struct request_queue *q;
-	unsigned long flags;
 	int block;
 	char msg[sizeof("request done ") + sizeof(int) * 3];
 

From a4668d9ba4be1ca9f4a39798ba3419fdfef0750d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 19 Nov 2018 08:18:24 -0700
Subject: [PATCH 140/351] nvme: default to 0 poll queues

We need a better way of configuring this, and given that polling is
(still) a bit niche, let's default to using 0 poll queues. That way
we'll have the same read/write/poll behavior as 4.20, and users that
want to test/use polling are required to do manual configuration of the
number of poll queues.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 89874e23e422..57e790391b82 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,7 +86,7 @@ MODULE_PARM_DESC(write_queues,
 	"Number of queues to use for writes. If not set, reads and writes "
 	"will share a queue set.");
 
-static int poll_queues = 1;
+static int poll_queues = 0;
 module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
 MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
 

From 849a370016a5489c49253338507ee6cc4a08df4b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 16 Nov 2018 08:37:34 -0700
Subject: [PATCH 141/351] block: avoid ordered task state change for polled IO

For the core poll helper, the task state setting don't need to imply any
atomics, as it's the current task itself that is being modified and
we're not going to sleep.

For IRQ driven, the wakeup path have the necessary barriers to not need
us using the heavy handed version of the task state setting.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 4 ++--
 fs/block_dev.c | 7 +++++--
 fs/iomap.c     | 3 ++-
 mm/page_io.c   | 3 ++-
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 32b246ed44c0..7fc4abb4cc36 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3331,12 +3331,12 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 		ret = q->mq_ops->poll(hctx, rq->tag);
 		if (ret > 0) {
 			hctx->poll_success++;
-			set_current_state(TASK_RUNNING);
+			__set_current_state(TASK_RUNNING);
 			return true;
 		}
 
 		if (signal_pending_state(state, current))
-			set_current_state(TASK_RUNNING);
+			__set_current_state(TASK_RUNNING);
 
 		if (current->state == TASK_RUNNING)
 			return true;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4d79bc80fb41..64ba27b8b754 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -237,9 +237,11 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 
 	qc = submit_bio(&bio);
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+
 		if (!READ_ONCE(bio.bi_private))
 			break;
+
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
 		    !blk_poll(bdev_get_queue(bdev), qc))
 			io_schedule();
@@ -415,7 +417,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		return -EIOCBQUEUED;
 
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+
 		if (!READ_ONCE(dio->waiter))
 			break;
 
diff --git a/fs/iomap.c b/fs/iomap.c
index b0462b363bad..c5df035ace6f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1888,7 +1888,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			return -EIOCBQUEUED;
 
 		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+
 			if (!READ_ONCE(dio->submit.waiter))
 				break;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 57572ff46016..a7271fa481f6 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -405,7 +405,8 @@ int swap_readpage(struct page *page, bool synchronous)
 	bio_get(bio);
 	qc = submit_bio(bio);
 	while (synchronous) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+
 		if (!READ_ONCE(bio->bi_private))
 			break;
 

From 85f4d4b65fdd67f1d6dc9eeb1d91923cef07eb6a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 6 Nov 2018 13:30:55 -0700
Subject: [PATCH 142/351] block: have ->poll_fn() return number of entries
 polled

We currently only really support sync poll, ie poll with 1 IO in flight.
This prepares us for supporting async poll.

Note that the returned value isn't necessarily 100% accurate. If poll
races with IRQ completion, we assume that the fact that the task is now
runnable means we found at least one entry. In reality it could be more
than 1, or not even 1. This is fine, the caller will just need to take
this into account.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c                | 18 +++++++++---------
 drivers/nvme/host/multipath.c |  4 ++--
 include/linux/blkdev.h        |  2 +-
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7fc4abb4cc36..52b1c97cd7c6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,7 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -3305,7 +3305,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	return true;
 }
 
-static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
 	struct request_queue *q = hctx->queue;
 	long state;
@@ -3318,7 +3318,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	 * straight to the busy poll loop.
 	 */
 	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
-		return true;
+		return 1;
 
 	hctx->poll_considered++;
 
@@ -3332,30 +3332,30 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 		if (ret > 0) {
 			hctx->poll_success++;
 			__set_current_state(TASK_RUNNING);
-			return true;
+			return ret;
 		}
 
 		if (signal_pending_state(state, current))
 			__set_current_state(TASK_RUNNING);
 
 		if (current->state == TASK_RUNNING)
-			return true;
+			return 1;
 		if (ret < 0)
 			break;
 		cpu_relax();
 	}
 
 	__set_current_state(TASK_RUNNING);
-	return false;
+	return 0;
 }
 
-static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 {
 	struct blk_mq_hw_ctx *hctx;
 	struct request *rq;
 
 	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-		return false;
+		return 0;
 
 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
 	if (!blk_qc_t_is_internal(cookie))
@@ -3369,7 +3369,7 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 		 * so we should be safe with just the NULL check.
 		 */
 		if (!rq)
-			return false;
+			return 0;
 	}
 
 	return __blk_mq_poll(hctx, rq);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8b841f39734c..f9eeb3b58632 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -220,11 +220,11 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 	return ret;
 }
 
-static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
+static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
 {
 	struct nvme_ns_head *head = q->queuedata;
 	struct nvme_ns *ns;
-	bool found = false;
+	int found = 0;
 	int srcu_idx;
 
 	srcu_idx = srcu_read_lock(&head->srcu);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1ad6eafc43f2..e97c0a3b2262 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef bool (poll_q_fn) (struct request_queue *q, blk_qc_t);
+typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);

From 92f806d678e5136e4777b21e5ed5368482ac9ea9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 19 Nov 2018 11:37:31 -0700
Subject: [PATCH 143/351] nvme-fc: remove ->poll implementation

It's specifically looking for a given request, which we will not be
supporting going forward. Also kill the qla2xxx poll implementation
as that's the only user of the nvme-fc poll, and the now unused
->poll_queue() hook.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by:  James Smart <jsmart2021@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/fc.c          | 33 ---------------------------------
 drivers/scsi/qla2xxx/qla_nvme.c | 12 ------------
 include/linux/nvme-fc-driver.h  |  1 -
 3 files changed, 46 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 98c3c77f48f6..de797c641265 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2302,38 +2302,6 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return nvme_fc_start_fcp_op(ctrl, queue, op, data_len, io_dir);
 }
 
-static struct blk_mq_tags *
-nvme_fc_tagset(struct nvme_fc_queue *queue)
-{
-	if (queue->qnum == 0)
-		return queue->ctrl->admin_tag_set.tags[queue->qnum];
-
-	return queue->ctrl->tag_set.tags[queue->qnum - 1];
-}
-
-static int
-nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
-
-{
-	struct nvme_fc_queue *queue = hctx->driver_data;
-	struct nvme_fc_ctrl *ctrl = queue->ctrl;
-	struct request *req;
-	struct nvme_fc_fcp_op *op;
-
-	req = blk_mq_tag_to_rq(nvme_fc_tagset(queue), tag);
-	if (!req)
-		return 0;
-
-	op = blk_mq_rq_to_pdu(req);
-
-	if ((atomic_read(&op->state) == FCPOP_STATE_ACTIVE) &&
-		 (ctrl->lport->ops->poll_queue))
-		ctrl->lport->ops->poll_queue(&ctrl->lport->localport,
-						 queue->lldd_handle);
-
-	return ((atomic_read(&op->state) != FCPOP_STATE_ACTIVE));
-}
-
 static void
 nvme_fc_submit_async_event(struct nvme_ctrl *arg)
 {
@@ -2404,7 +2372,6 @@ static const struct blk_mq_ops nvme_fc_mq_ops = {
 	.init_request	= nvme_fc_init_request,
 	.exit_request	= nvme_fc_exit_request,
 	.init_hctx	= nvme_fc_init_hctx,
-	.poll		= nvme_fc_poll,
 	.timeout	= nvme_fc_timeout,
 };
 
diff --git a/drivers/scsi/qla2xxx/qla_nvme.c b/drivers/scsi/qla2xxx/qla_nvme.c
index 7e78e7eff783..fccc733145fc 100644
--- a/drivers/scsi/qla2xxx/qla_nvme.c
+++ b/drivers/scsi/qla2xxx/qla_nvme.c
@@ -272,17 +272,6 @@ static void qla_nvme_fcp_abort(struct nvme_fc_local_port *lport,
 	schedule_work(&priv->abort_work);
 }
 
-static void qla_nvme_poll(struct nvme_fc_local_port *lport, void *hw_queue_handle)
-{
-	struct qla_qpair *qpair = hw_queue_handle;
-	unsigned long flags;
-	struct scsi_qla_host *vha = lport->private;
-
-	spin_lock_irqsave(&qpair->qp_lock, flags);
-	qla24xx_process_response_queue(vha, qpair->rsp);
-	spin_unlock_irqrestore(&qpair->qp_lock, flags);
-}
-
 static inline int qla2x00_start_nvme_mq(srb_t *sp)
 {
 	unsigned long   flags;
@@ -578,7 +567,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
 	.ls_abort	= qla_nvme_ls_abort,
 	.fcp_io		= qla_nvme_post_cmd,
 	.fcp_abort	= qla_nvme_fcp_abort,
-	.poll_queue	= qla_nvme_poll,
 	.max_hw_queues  = 8,
 	.max_sgl_segments = 128,
 	.max_dif_sgl_segments = 64,
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index 496ff759f84c..f4ab3b1925ac 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -403,7 +403,6 @@ struct nvme_fc_port_template {
 				void **handle);
 	void	(*delete_queue)(struct nvme_fc_local_port *,
 				unsigned int qidx, void *handle);
-	void	(*poll_queue)(struct nvme_fc_local_port *, void *handle);
 	int	(*ls_req)(struct nvme_fc_local_port *,
 				struct nvme_fc_remote_port *,
 				struct nvmefc_ls_req *);

From 23464f8c3407b83106463999b64fe10dc66ff6a3 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:33 +0900
Subject: [PATCH 144/351] aio: Comment use of IOCB_FLAG_IOPRIO aio flag

Comment the use of the IOCB_FLAG_IOPRIO aio flag similarly to the
IOCB_FLAG_RESFD flag.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/uapi/linux/aio_abi.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index ce43d340f010..8387e0af0f76 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -50,6 +50,8 @@ enum {
  *
  * IOCB_FLAG_RESFD - Set if the "aio_resfd" member of the "struct iocb"
  *                   is valid.
+ * IOCB_FLAG_IOPRIO - Set if the "aio_reqprio" member of the "struct iocb"
+ *                    is valid.
  */
 #define IOCB_FLAG_RESFD		(1 << 0)
 #define IOCB_FLAG_IOPRIO	(1 << 1)

From e2b3fa5af70c1e646270f6c7c799414f5e904d7a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:34 +0900
Subject: [PATCH 145/351] block: Remove bio->bi_ioc

bio->bi_ioc is never set so always NULL. Remove references to it in
bio_disassociate_task() and in rq_ioc() and delete this field from
struct bio. With this change, rq_ioc() always returns
current->io_context without the need for a bio argument. Further
simplify the code and make it more readable by also removing this
helper, which also allows to simplify blk_mq_sched_assign_ioc() by
removing its bio argument.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               |  4 ----
 block/blk-core.c          |  2 +-
 block/blk-mq-sched.c      |  4 ++--
 block/blk-mq-sched.h      |  2 +-
 block/blk-mq.c            |  4 ++--
 block/blk.h               | 16 ----------------
 include/linux/blk_types.h |  3 +--
 7 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 4f4d9884443b..03895cc0d74a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2027,10 +2027,6 @@ int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
  */
 void bio_disassociate_task(struct bio *bio)
 {
-	if (bio->bi_ioc) {
-		put_io_context(bio->bi_ioc);
-		bio->bi_ioc = NULL;
-	}
 	if (bio->bi_css) {
 		css_put(bio->bi_css);
 		bio->bi_css = NULL;
diff --git a/block/blk-core.c b/block/blk-core.c
index d6e8ab9ca99d..492648c96992 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -813,7 +813,7 @@ out:
 
 void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
-	struct io_context *ioc = rq_ioc(bio);
+	struct io_context *ioc = current->io_context;
 
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index d084f731d104..13b8dc332541 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -31,10 +31,10 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
 
-void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
+void blk_mq_sched_assign_ioc(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-	struct io_context *ioc = rq_ioc(bio);
+	struct io_context *ioc = current->io_context;
 	struct io_cq *icq;
 
 	spin_lock_irq(&q->queue_lock);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 7ff5671bf128..0f719c8532ae 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -8,7 +8,7 @@
 void blk_mq_sched_free_hctx_data(struct request_queue *q,
 				 void (*exit)(struct blk_mq_hw_ctx *));
 
-void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio);
+void blk_mq_sched_assign_ioc(struct request *rq);
 
 void blk_mq_sched_request_inserted(struct request *rq);
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 52b1c97cd7c6..174384eaace7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -389,8 +389,8 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	if (!op_is_flush(data->cmd_flags)) {
 		rq->elv.icq = NULL;
 		if (e && e->type->ops.prepare_request) {
-			if (e->type->icq_cache && rq_ioc(bio))
-				blk_mq_sched_assign_ioc(rq, bio);
+			if (e->type->icq_cache)
+				blk_mq_sched_assign_ioc(rq);
 
 			e->type->ops.prepare_request(rq, bio);
 			rq->rq_flags |= RQF_ELVPRIV;
diff --git a/block/blk.h b/block/blk.h
index 816a9abb87cd..610948157a5b 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -254,22 +254,6 @@ void ioc_clear_queue(struct request_queue *q);
 
 int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
-/**
- * rq_ioc - determine io_context for request allocation
- * @bio: request being allocated is for this bio (can be %NULL)
- *
- * Determine io_context to use for request allocation for @bio.  May return
- * %NULL if %current->io_context doesn't exist.
- */
-static inline struct io_context *rq_ioc(struct bio *bio)
-{
-#ifdef CONFIG_BLK_CGROUP
-	if (bio && bio->bi_ioc)
-		return bio->bi_ioc;
-#endif
-	return current->io_context;
-}
-
 /**
  * create_io_context - try to create task->io_context
  * @gfp_mask: allocation mask
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dbdbfbd6a987..c0ba1a038ff3 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,10 +174,9 @@ struct bio {
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
-	 * Optional ioc and css associated with this bio.  Put on bio
+	 * Optional css associated with this bio.  Put on bio
 	 * release.  Read comment on top of bio_associate_current().
 	 */
-	struct io_context	*bi_ioc;
 	struct cgroup_subsys_state *bi_css;
 	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;

From 64845a1ddd655574886eb48e9a5eaeeb9b05bf0d Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:35 +0900
Subject: [PATCH 146/351] block: Introduce get_current_ioprio()

Define get_current_ioprio() as an inline helper to obtain the caller
I/O priority from its task I/O context. Use this helper in
blk_init_request_from_bio() to set a request ioprio.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  6 +-----
 include/linux/ioprio.h | 13 +++++++++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 492648c96992..4450d3c08f25 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -813,18 +813,14 @@ out:
 
 void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
-	struct io_context *ioc = current->io_context;
-
 	if (bio->bi_opf & REQ_RAHEAD)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
 	req->__sector = bio->bi_iter.bi_sector;
 	if (ioprio_valid(bio_prio(bio)))
 		req->ioprio = bio_prio(bio);
-	else if (ioc)
-		req->ioprio = ioc->ioprio;
 	else
-		req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+		req->ioprio = get_current_ioprio();
 	req->write_hint = bio->bi_write_hint;
 	blk_rq_bio_prep(req->q, req, bio);
 }
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 9e30ed6443db..e9bfe6972aed 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -70,6 +70,19 @@ static inline int task_nice_ioclass(struct task_struct *task)
 		return IOPRIO_CLASS_BE;
 }
 
+/*
+ * If the calling process has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ */
+static inline int get_current_ioprio(void)
+{
+	struct io_context *ioc = current->io_context;
+
+	if (ioc)
+		return ioc->ioprio;
+	return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+}
+
 /*
  * For inheritance, return the highest of the two given priorities
  */

From 76dc891395dc61e92e2ff31b6161815ce5eb715b Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:36 +0900
Subject: [PATCH 147/351] aio: Fix fallback I/O priority value

For cases when the application does not specify aio_reqprio for an aio,
fallback to use get_current_ioprio() to obtain the task I/O priority
last set using ioprio_set() rather than the hardcoded IOPRIO_CLASS_NONE
value.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/aio.c b/fs/aio.c
index 301e6314183b..b984918be4b7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1441,7 +1441,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
 
 		req->ki_ioprio = iocb->aio_reqprio;
 	} else
-		req->ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+		req->ki_ioprio = get_current_ioprio();
 
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))

From 668ffc03418bc779f699797c72ecf968cd6525a9 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:37 +0900
Subject: [PATCH 148/351] block: prevent merging of requests with different
 priorities

Growing in size a high priority request by merging it with a lower
priority BIO or request will increase the request execution time. This
is the opposite result of the desired effect of high I/O priorities,
namely getting low I/O latencies. Prevent merging of requests and BIOs
that have different I/O priorities to fix this.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c  | 3 ---
 block/blk-merge.c | 8 ++++++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 4450d3c08f25..dde30b08aa14 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -662,7 +662,6 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
-	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
 	blk_account_io_start(req, false);
 	return true;
@@ -686,7 +685,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 
 	req->__sector = bio->bi_iter.bi_sector;
 	req->__data_len += bio->bi_iter.bi_size;
-	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 
 	blk_account_io_start(req, false);
 	return true;
@@ -706,7 +704,6 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
-	req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
 	req->nr_phys_segments = segments + 1;
 
 	blk_account_io_start(req, false);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b1df622cbd85..6be04ef8da5b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -752,6 +752,9 @@ static struct request *attempt_merge(struct request_queue *q,
 	if (req->write_hint != next->write_hint)
 		return NULL;
 
+	if (req->ioprio != next->ioprio)
+		return NULL;
+
 	/*
 	 * If we are allowed to merge, then append bio list
 	 * from next to rq and release next. merge_requests_fn
@@ -807,8 +810,6 @@ static struct request *attempt_merge(struct request_queue *q,
 	 */
 	blk_account_io_merge(next);
 
-	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
-
 	/*
 	 * ownership of bio passed from next to req, return 'next' for
 	 * the caller to free
@@ -883,6 +884,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 	if (rq->write_hint != bio->bi_write_hint)
 		return false;
 
+	if (rq->ioprio != bio_prio(bio))
+		return false;
+
 	return true;
 }
 

From 20578bdfd0418efb11ec316229e670d085cd574a Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Tue, 20 Nov 2018 10:52:38 +0900
Subject: [PATCH 149/351] block: Initialize BIO I/O priority early

For the synchronous I/O path case (read(), write() etc system calls), a
BIO I/O priority is not initialized until the execution of
blk_init_request_from_bio() when the BIO is submitted and a request
initialized for the BIO execution. This is due to the ki_ioprio field of
the struct kiocb defined on stack being always initialized to
IOPRIO_CLASS_NONE, regardless of the calling process I/O context ioprio
value set with ioprio_set(). This late initialization can result in the
BIO being merged to pending requests even when the I/O priorities
differ.

Fix this by initializing the ki_iopriority field of on stack struct
kiocb using the get_current_ioprio() helper, ensuring that all BIOs
allocated and submitted for the system call execution see the correct
intended I/O priority early. With this, since a BIO I/O priority is
always set to the intended effective value for both the sync and async
path, blk_init_request_from_bio() can be simplified.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Adam Manzanares <adam.manzanares@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c   | 5 +----
 include/linux/fs.h | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index dde30b08aa14..04f5be473638 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -814,10 +814,7 @@ void blk_init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_FAILFAST_MASK;
 
 	req->__sector = bio->bi_iter.bi_sector;
-	if (ioprio_valid(bio_prio(bio)))
-		req->ioprio = bio_prio(bio);
-	else
-		req->ioprio = get_current_ioprio();
+	req->ioprio = bio_prio(bio);
 	req->write_hint = bio->bi_write_hint;
 	blk_rq_bio_prep(req->q, req, bio);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c95c0807471f..a1ab233e6469 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2021,7 +2021,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 		.ki_filp = filp,
 		.ki_flags = iocb_flags(filp),
 		.ki_hint = ki_hint_validate(file_write_hint(filp)),
-		.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0),
+		.ki_ioprio = get_current_ioprio(),
 	};
 }
 

From 0c62bff1fd633774756be6d88d71002cd37615e0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 20 Nov 2018 19:12:46 -0700
Subject: [PATCH 150/351] block: fix attempt to assign NULL io_context

If the first request allocated and issued by a process is a passhthrough
request, we don't set up an IO context for it. Ensure that
blk_mq_sched_assign_ioc() ignores a NULL io_context.

Fixes: e2b3fa5af70c ("block: Remove bio->bi_ioc")
Reported-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 13b8dc332541..f096d8989773 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -34,9 +34,16 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
 void blk_mq_sched_assign_ioc(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-	struct io_context *ioc = current->io_context;
+	struct io_context *ioc;
 	struct io_cq *icq;
 
+	/*
+	 * May not have an IO context if it's a passthrough request
+	 */
+	ioc = current->io_context;
+	if (!ioc)
+		return;
+
 	spin_lock_irq(&q->queue_lock);
 	icq = ioc_lookup_icq(ioc, q);
 	spin_unlock_irq(&q->queue_lock);

From 1db4909e76f64a85f4aaa187f0f683f5c85a471d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 20 Nov 2018 09:44:35 +0800
Subject: [PATCH 151/351] blk-mq: not embed .mq_kobj and ctx->kobj into queue
 instance

Even though .mq_kobj, ctx->kobj and q->kobj share same lifetime
from block layer's view, actually they don't because userspace may
grab one kobject anytime via sysfs.

This patch fixes the issue by the following approach:

1) introduce 'struct blk_mq_ctxs' for holding .mq_kobj and managing
all ctxs

2) free all allocated ctxs and the 'blk_mq_ctxs' instance in release
handler of .mq_kobj

3) grab one ref of .mq_kobj before initializing each ctx->kobj, so that
.mq_kobj is always released after all ctxs are freed.

This patch fixes kernel panic issue during booting when DEBUG_KOBJECT_RELEASE
is enabled.

Reported-by: Guenter Roeck <linux@roeck-us.net>
Cc: "jianchao.wang" <jianchao.w.wang@oracle.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sysfs.c   | 34 ++++++++++++++++++++++++----------
 block/blk-mq.c         | 39 ++++++++++++++++++++++++++++++++-------
 block/blk-mq.h         |  6 ++++++
 include/linux/blkdev.h |  2 +-
 4 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 3d25b9c419e9..6efef1f679f0 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -15,6 +15,18 @@
 
 static void blk_mq_sysfs_release(struct kobject *kobj)
 {
+	struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj);
+
+	free_percpu(ctxs->queue_ctx);
+	kfree(ctxs);
+}
+
+static void blk_mq_ctx_sysfs_release(struct kobject *kobj)
+{
+	struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj);
+
+	/* ctx->ctxs won't be released until all ctx are freed */
+	kobject_put(&ctx->ctxs->kobj);
 }
 
 static void blk_mq_hw_sysfs_release(struct kobject *kobj)
@@ -213,7 +225,7 @@ static struct kobj_type blk_mq_ktype = {
 static struct kobj_type blk_mq_ctx_ktype = {
 	.sysfs_ops	= &blk_mq_sysfs_ops,
 	.default_attrs	= default_ctx_attrs,
-	.release	= blk_mq_sysfs_release,
+	.release	= blk_mq_ctx_sysfs_release,
 };
 
 static struct kobj_type blk_mq_hw_ktype = {
@@ -245,7 +257,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
 	if (!hctx->nr_ctx)
 		return 0;
 
-	ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num);
+	ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num);
 	if (ret)
 		return ret;
 
@@ -268,8 +280,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i)
 		blk_mq_unregister_hctx(hctx);
 
-	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(&q->mq_kobj);
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
 	kobject_put(&dev->kobj);
 
 	q->mq_sysfs_init_done = false;
@@ -289,7 +301,7 @@ void blk_mq_sysfs_deinit(struct request_queue *q)
 		ctx = per_cpu_ptr(q->queue_ctx, cpu);
 		kobject_put(&ctx->kobj);
 	}
-	kobject_put(&q->mq_kobj);
+	kobject_put(q->mq_kobj);
 }
 
 void blk_mq_sysfs_init(struct request_queue *q)
@@ -297,10 +309,12 @@ void blk_mq_sysfs_init(struct request_queue *q)
 	struct blk_mq_ctx *ctx;
 	int cpu;
 
-	kobject_init(&q->mq_kobj, &blk_mq_ktype);
+	kobject_init(q->mq_kobj, &blk_mq_ktype);
 
 	for_each_possible_cpu(cpu) {
 		ctx = per_cpu_ptr(q->queue_ctx, cpu);
+
+		kobject_get(q->mq_kobj);
 		kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
 	}
 }
@@ -313,11 +327,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
 	WARN_ON_ONCE(!q->kobj.parent);
 	lockdep_assert_held(&q->sysfs_lock);
 
-	ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+	ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
 	if (ret < 0)
 		goto out;
 
-	kobject_uevent(&q->mq_kobj, KOBJ_ADD);
+	kobject_uevent(q->mq_kobj, KOBJ_ADD);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		ret = blk_mq_register_hctx(hctx);
@@ -334,8 +348,8 @@ unreg:
 	while (--i >= 0)
 		blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
 
-	kobject_uevent(&q->mq_kobj, KOBJ_REMOVE);
-	kobject_del(&q->mq_kobj);
+	kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+	kobject_del(q->mq_kobj);
 	kobject_put(&dev->kobj);
 	return ret;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 174384eaace7..b16204df65d1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2515,6 +2515,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
 	mutex_unlock(&set->tag_list_lock);
 }
 
+/* All allocations will be freed in release handler of q->mq_kobj */
+static int blk_mq_alloc_ctxs(struct request_queue *q)
+{
+	struct blk_mq_ctxs *ctxs;
+	int cpu;
+
+	ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
+	if (!ctxs)
+		return -ENOMEM;
+
+	ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
+	if (!ctxs->queue_ctx)
+		goto fail;
+
+	for_each_possible_cpu(cpu) {
+		struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
+		ctx->ctxs = ctxs;
+	}
+
+	q->mq_kobj = &ctxs->kobj;
+	q->queue_ctx = ctxs->queue_ctx;
+
+	return 0;
+ fail:
+	kfree(ctxs);
+	return -ENOMEM;
+}
+
 /*
  * It is the actual release handler for mq, but we do it from
  * request queue's release handler for avoiding use-after-free
@@ -2540,8 +2568,6 @@ void blk_mq_release(struct request_queue *q)
 	 * both share lifetime with request queue.
 	 */
 	blk_mq_sysfs_deinit(q);
-
-	free_percpu(q->queue_ctx);
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
@@ -2731,8 +2757,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!q->poll_cb)
 		goto err_exit;
 
-	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
-	if (!q->queue_ctx)
+	if (blk_mq_alloc_ctxs(q))
 		goto err_exit;
 
 	/* init q->mq_kobj and sw queues' kobjects */
@@ -2742,7 +2767,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
 						GFP_KERNEL, set->numa_node);
 	if (!q->queue_hw_ctx)
-		goto err_percpu;
+		goto err_sys_init;
 
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
@@ -2794,8 +2819,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 
 err_hctxs:
 	kfree(q->queue_hw_ctx);
-err_percpu:
-	free_percpu(q->queue_ctx);
+err_sys_init:
+	blk_mq_sysfs_deinit(q);
 err_exit:
 	q->mq_ops = NULL;
 	return ERR_PTR(-ENOMEM);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index facb6e9ddce4..9ae8e9f8f8b1 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -7,6 +7,11 @@
 
 struct blk_mq_tag_set;
 
+struct blk_mq_ctxs {
+	struct kobject kobj;
+	struct blk_mq_ctx __percpu	*queue_ctx;
+};
+
 /**
  * struct blk_mq_ctx - State for a software queue facing the submitting CPUs
  */
@@ -27,6 +32,7 @@ struct blk_mq_ctx {
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
 
 	struct request_queue	*queue;
+	struct blk_mq_ctxs      *ctxs;
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e97c0a3b2262..9b53db06ad08 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -456,7 +456,7 @@ struct request_queue {
 	/*
 	 * mq queue kobject
 	 */
-	struct kobject mq_kobj;
+	struct kobject *mq_kobj;
 
 #ifdef  CONFIG_BLK_DEV_INTEGRITY
 	struct blk_integrity integrity;

From 1052b8ac5282daf35df331edcbdb645839d17e6a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 26 Nov 2018 08:21:49 -0700
Subject: [PATCH 152/351] blk-mq: when polling for IO, look for any completion

If we want to support async IO polling, then we have to allow finding
completions that aren't just for the one we are looking for. Always pass
in -1 to the mq_ops->poll() helper, and have that return how many events
were found in this poll loop.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c         | 13 ++++++--
 block/blk-mq.c           | 71 ++++++++++++++++++++--------------------
 drivers/nvme/host/pci.c  | 14 ++++----
 drivers/nvme/host/rdma.c | 39 +++++++++-------------
 include/linux/blkdev.h   |  2 +-
 5 files changed, 70 insertions(+), 69 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 04f5be473638..03c4202b69bf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1273,10 +1273,19 @@ blk_qc_t submit_bio(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
-bool blk_poll(struct request_queue *q, blk_qc_t cookie)
+/**
+ * blk_poll - poll for IO completions
+ * @q:  the queue
+ * @cookie: cookie passed back at IO submission time
+ *
+ * Description:
+ *    Poll for completions on the passed in queue. Returns number of
+ *    completed entries found.
+ */
+int blk_poll(struct request_queue *q, blk_qc_t cookie)
 {
 	if (!q->poll_fn || !blk_qc_t_valid(cookie))
-		return false;
+		return 0;
 
 	if (current->plug)
 		blk_flush_plug_list(current->plug, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b16204df65d1..ec6c79578332 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3285,15 +3285,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 		return false;
 
 	/*
-	 * poll_nsec can be:
+	 * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
 	 *
-	 * -1:	don't ever hybrid sleep
 	 *  0:	use half of prev avg
 	 * >0:	use this specific value
 	 */
-	if (q->poll_nsec == -1)
-		return false;
-	else if (q->poll_nsec > 0)
+	if (q->poll_nsec > 0)
 		nsecs = q->poll_nsec;
 	else
 		nsecs = blk_mq_poll_nsecs(q, hctx, rq);
@@ -3330,11 +3327,41 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
 	return true;
 }
 
-static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static bool blk_mq_poll_hybrid(struct request_queue *q,
+			       struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
 {
-	struct request_queue *q = hctx->queue;
+	struct request *rq;
+
+	if (q->poll_nsec == -1)
+		return false;
+
+	if (!blk_qc_t_is_internal(cookie))
+		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
+	else {
+		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
+		/*
+		 * With scheduling, if the request has completed, we'll
+		 * get a NULL return here, as we clear the sched tag when
+		 * that happens. The request still remains valid, like always,
+		 * so we should be safe with just the NULL check.
+		 */
+		if (!rq)
+			return false;
+	}
+
+	return blk_mq_poll_hybrid_sleep(q, hctx, rq);
+}
+
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+{
+	struct blk_mq_hw_ctx *hctx;
 	long state;
 
+	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		return 0;
+
+	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
+
 	/*
 	 * If we sleep, have the caller restart the poll loop to reset
 	 * the state. Like for the other success return cases, the
@@ -3342,7 +3369,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	 * the IO isn't complete, we'll get called again and will go
 	 * straight to the busy poll loop.
 	 */
-	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
+	if (blk_mq_poll_hybrid(q, hctx, cookie))
 		return 1;
 
 	hctx->poll_considered++;
@@ -3353,7 +3380,7 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 
 		hctx->poll_invoked++;
 
-		ret = q->mq_ops->poll(hctx, rq->tag);
+		ret = q->mq_ops->poll(hctx, -1U);
 		if (ret > 0) {
 			hctx->poll_success++;
 			__set_current_state(TASK_RUNNING);
@@ -3374,32 +3401,6 @@ static int __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return 0;
 }
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
-{
-	struct blk_mq_hw_ctx *hctx;
-	struct request *rq;
-
-	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-		return 0;
-
-	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-	if (!blk_qc_t_is_internal(cookie))
-		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
-	else {
-		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
-		/*
-		 * With scheduling, if the request has completed, we'll
-		 * get a NULL return here, as we clear the sched tag when
-		 * that happens. The request still remains valid, like always,
-		 * so we should be safe with just the NULL check.
-		 */
-		if (!rq)
-			return 0;
-	}
-
-	return __blk_mq_poll(hctx, rq);
-}
-
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
 	return rq->mq_ctx->cpu;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 57e790391b82..de50d80ecc84 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1012,15 +1012,15 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
 	}
 }
 
-static inline bool nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
-		u16 *end, int tag)
+static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
+				  u16 *end, unsigned int tag)
 {
-	bool found = false;
+	int found = 0;
 
 	*start = nvmeq->cq_head;
-	while (!found && nvme_cqe_pending(nvmeq)) {
-		if (nvmeq->cqes[nvmeq->cq_head].command_id == tag)
-			found = true;
+	while (nvme_cqe_pending(nvmeq)) {
+		if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
+			found++;
 		nvme_update_cq_head(nvmeq);
 	}
 	*end = nvmeq->cq_head;
@@ -1062,7 +1062,7 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 {
 	u16 start, end;
-	bool found;
+	int found;
 
 	if (!nvme_cqe_pending(nvmeq))
 		return 0;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d181cafedc58..c2c3e1a5b7af 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1409,12 +1409,11 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
 	WARN_ON_ONCE(ret);
 }
 
-static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
-		struct nvme_completion *cqe, struct ib_wc *wc, int tag)
+static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
+		struct nvme_completion *cqe, struct ib_wc *wc)
 {
 	struct request *rq;
 	struct nvme_rdma_request *req;
-	int ret = 0;
 
 	rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
 	if (!rq) {
@@ -1422,7 +1421,7 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 			"tag 0x%x on QP %#x not found\n",
 			cqe->command_id, queue->qp->qp_num);
 		nvme_rdma_error_recovery(queue->ctrl);
-		return ret;
+		return;
 	}
 	req = blk_mq_rq_to_pdu(rq);
 
@@ -1437,6 +1436,8 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 			nvme_rdma_error_recovery(queue->ctrl);
 		}
 	} else if (req->mr) {
+		int ret;
+
 		ret = nvme_rdma_inv_rkey(queue, req);
 		if (unlikely(ret < 0)) {
 			dev_err(queue->ctrl->ctrl.device,
@@ -1445,19 +1446,14 @@ static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 			nvme_rdma_error_recovery(queue->ctrl);
 		}
 		/* the local invalidation completion will end the request */
-		return 0;
+		return;
 	}
 
-	if (refcount_dec_and_test(&req->ref)) {
-		if (rq->tag == tag)
-			ret = 1;
+	if (refcount_dec_and_test(&req->ref))
 		nvme_end_request(rq, req->status, req->result);
-	}
-
-	return ret;
 }
 
-static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
+static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct nvme_rdma_qe *qe =
 		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
@@ -1465,11 +1461,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
 	struct ib_device *ibdev = queue->device->dev;
 	struct nvme_completion *cqe = qe->data;
 	const size_t len = sizeof(struct nvme_completion);
-	int ret = 0;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		nvme_rdma_wr_error(cq, wc, "RECV");
-		return 0;
+		return;
 	}
 
 	ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
@@ -1484,16 +1479,10 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
 		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 				&cqe->result);
 	else
-		ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
+		nvme_rdma_process_nvme_rsp(queue, cqe, wc);
 	ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
 
 	nvme_rdma_post_recv(queue, qe);
-	return ret;
-}
-
-static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
-{
-	__nvme_rdma_recv_done(cq, wc, -1);
 }
 
 static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
@@ -1758,10 +1747,12 @@ static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 		struct ib_cqe *cqe = wc.wr_cqe;
 
 		if (cqe) {
-			if (cqe->done == nvme_rdma_recv_done)
-				found |= __nvme_rdma_recv_done(cq, &wc, tag);
-			else
+			if (cqe->done == nvme_rdma_recv_done) {
+				nvme_rdma_recv_done(cq, &wc);
+				found++;
+			} else {
 				cqe->done(cq, &wc);
+			}
 		}
 	}
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9b53db06ad08..f3015e9b5ae3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -867,7 +867,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-bool blk_poll(struct request_queue *q, blk_qc_t cookie);
+int blk_poll(struct request_queue *q, blk_qc_t cookie);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {

From 9743139c5d11ab170f70a308dcb88c342390adfb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 16 Nov 2018 09:48:21 -0700
Subject: [PATCH 153/351] blk-mq: remove 'tag' parameter from mq_ops->poll()

We always pass in -1 now and none of the callers use the tag value,
remove the parameter.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c           | 2 +-
 drivers/nvme/host/pci.c  | 8 ++++----
 drivers/nvme/host/rdma.c | 2 +-
 include/linux/blk-mq.h   | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec6c79578332..b66cca3ce1e5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3380,7 +3380,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 		hctx->poll_invoked++;
 
-		ret = q->mq_ops->poll(hctx, -1U);
+		ret = q->mq_ops->poll(hctx);
 		if (ret > 0) {
 			hctx->poll_success++;
 			__set_current_state(TASK_RUNNING);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index de50d80ecc84..73effe586e5f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1075,14 +1075,14 @@ static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 	return found;
 }
 
-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_poll(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 
-	return __nvme_poll(nvmeq, tag);
+	return __nvme_poll(nvmeq, -1);
 }
 
-static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	u16 start, end;
@@ -1092,7 +1092,7 @@ static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx, unsigned int tag)
 		return 0;
 
 	spin_lock(&nvmeq->cq_lock);
-	found = nvme_process_cq(nvmeq, &start, &end, tag);
+	found = nvme_process_cq(nvmeq, &start, &end, -1);
 	spin_unlock(&nvmeq->cq_lock);
 
 	nvme_complete_cqes(nvmeq, start, end);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index c2c3e1a5b7af..ccfde6c7c0a5 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1736,7 +1736,7 @@ err:
 	return BLK_STS_IOERR;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_rdma_queue *queue = hctx->driver_data;
 	struct ib_cq *cq = queue->ib_cq;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 929e8abc5535..ca0520ca6437 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -132,7 +132,7 @@ typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *,
 typedef bool (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
 typedef bool (busy_tag_iter_fn)(struct request *, void *, bool);
-typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (poll_fn)(struct blk_mq_hw_ctx *);
 typedef int (map_queues_fn)(struct blk_mq_tag_set *set);
 typedef bool (busy_fn)(struct request_queue *);
 typedef void (complete_fn)(struct request *);

From e7d943910719b44738e86f91a26a64e3b61ae419 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 19 Nov 2018 08:29:33 -0700
Subject: [PATCH 154/351] nvme: remove opportunistic polling from bdev target

It doesn't set HIPRI on the bio, so polling for it is pretty silly.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/io-cmd-bdev.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index c1ec3475a140..c1cb2ed5531c 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -115,8 +115,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 	}
 
 	cookie = submit_bio(bio);
-
-	blk_poll(bdev_get_queue(req->ns->bdev), cookie);
 }
 
 static void nvmet_bdev_execute_flush(struct nvmet_req *req)

From 0a1b8b87d064a47fad9ec475316002da28559207 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 26 Nov 2018 08:24:43 -0700
Subject: [PATCH 155/351] block: make blk_poll() take a parameter on whether to
 spin or not

blk_poll() has always kept spinning until it found an IO. This is
fine for SYNC polling, since we need to find one request we have
pending, but in preparation for ASYNC polling it can be beneficial
to just check if we have any entries available or not.

Existing callers are converted to pass in 'spin == true', to retain
the old behavior.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c              | 9 ++++++---
 block/blk-mq.c                | 6 +++---
 drivers/nvme/host/multipath.c | 4 ++--
 fs/block_dev.c                | 4 ++--
 fs/direct-io.c                | 2 +-
 fs/iomap.c                    | 2 +-
 include/linux/blkdev.h        | 4 ++--
 mm/page_io.c                  | 2 +-
 8 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 03c4202b69bf..9af56dbb84f1 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1277,19 +1277,22 @@ EXPORT_SYMBOL(submit_bio);
  * blk_poll - poll for IO completions
  * @q:  the queue
  * @cookie: cookie passed back at IO submission time
+ * @spin: whether to spin for completions
  *
  * Description:
  *    Poll for completions on the passed in queue. Returns number of
- *    completed entries found.
+ *    completed entries found. If @spin is true, then blk_poll will continue
+ *    looping until at least one completion is found, unless the task is
+ *    otherwise marked running (or we need to reschedule).
  */
-int blk_poll(struct request_queue *q, blk_qc_t cookie)
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
 	if (!q->poll_fn || !blk_qc_t_valid(cookie))
 		return 0;
 
 	if (current->plug)
 		blk_flush_plug_list(current->plug, false);
-	return q->poll_fn(q, cookie);
+	return q->poll_fn(q, cookie, spin);
 }
 EXPORT_SYMBOL_GPL(blk_poll);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b66cca3ce1e5..c2751f0a3ccc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,7 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -3352,7 +3352,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
 	return blk_mq_poll_hybrid_sleep(q, hctx, rq);
 }
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
+static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
 	struct blk_mq_hw_ctx *hctx;
 	long state;
@@ -3392,7 +3392,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 
 		if (current->state == TASK_RUNNING)
 			return 1;
-		if (ret < 0)
+		if (ret < 0 || !spin)
 			break;
 		cpu_relax();
 	}
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f9eeb3b58632..ffebdd0ae34b 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -220,7 +220,7 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 	return ret;
 }
 
-static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
+static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin)
 {
 	struct nvme_ns_head *head = q->queuedata;
 	struct nvme_ns *ns;
@@ -230,7 +230,7 @@ static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
 	if (likely(ns && nvme_path_is_optimized(ns)))
-		found = ns->queue->poll_fn(q, qc);
+		found = ns->queue->poll_fn(q, qc, spin);
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return found;
 }
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 64ba27b8b754..d233a59ea364 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -243,7 +243,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
 			break;
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(bdev), qc))
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
 			io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -423,7 +423,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 			break;
 
 		if (!(iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(bdev_get_queue(bdev), qc))
+		    !blk_poll(bdev_get_queue(bdev), qc, true))
 			io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index ea07d5a34317..a5a4e5a1423e 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -518,7 +518,7 @@ static struct bio *dio_await_one(struct dio *dio)
 		dio->waiter = current;
 		spin_unlock_irqrestore(&dio->bio_lock, flags);
 		if (!(dio->iocb->ki_flags & IOCB_HIPRI) ||
-		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie))
+		    !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true))
 			io_schedule();
 		/* wake up sets us TASK_RUNNING */
 		spin_lock_irqsave(&dio->bio_lock, flags);
diff --git a/fs/iomap.c b/fs/iomap.c
index c5df035ace6f..74c1f37f0fd6 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1896,7 +1896,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			if (!(iocb->ki_flags & IOCB_HIPRI) ||
 			    !dio->submit.last_queue ||
 			    !blk_poll(dio->submit.last_queue,
-					 dio->submit.cookie))
+					 dio->submit.cookie, true))
 				io_schedule();
 		}
 		__set_current_state(TASK_RUNNING);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f3015e9b5ae3..e3c0a8ec16a7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,7 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t);
+typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
@@ -867,7 +867,7 @@ extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
-int blk_poll(struct request_queue *q, blk_qc_t cookie);
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
diff --git a/mm/page_io.c b/mm/page_io.c
index a7271fa481f6..5bdfd21c1bd9 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -410,7 +410,7 @@ int swap_readpage(struct page *page, bool synchronous)
 		if (!READ_ONCE(bio->bi_private))
 			break;
 
-		if (!blk_poll(disk->queue, qc))
+		if (!blk_poll(disk->queue, qc, true))
 			break;
 	}
 	__set_current_state(TASK_RUNNING);

From aa61bec30eca11816789dc25c2090366b0ccfaf8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 13 Nov 2018 21:32:10 -0700
Subject: [PATCH 156/351] blk-mq: ensure mq_ops ->poll() is entered at least
 once

Right now we immediately bail if need_resched() is true, but
we need to do at least one loop in case we have entries waiting.
So just invert the need_resched() check, putting it at the
bottom of the loop.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c2751f0a3ccc..ba3c7b6476b7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3375,7 +3375,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 	hctx->poll_considered++;
 
 	state = current->state;
-	while (!need_resched()) {
+	do {
 		int ret;
 
 		hctx->poll_invoked++;
@@ -3395,7 +3395,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 		if (ret < 0 || !spin)
 			break;
 		cpu_relax();
-	}
+	} while (!need_resched());
 
 	__set_current_state(TASK_RUNNING);
 	return 0;

From 4ab32bf3305eedb4d31f85cac68a67becab10494 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 18 Nov 2018 16:15:35 -0700
Subject: [PATCH 157/351] blk-mq: never redirect polled IO completions

It's pointless to do so, we are by definition on the CPU we want/need
to be, as that's the one waiting for a completion event.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ba3c7b6476b7..37674c1766a7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -585,7 +585,12 @@ static void __blk_mq_complete_request(struct request *rq)
 		return;
 	}
 
-	if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
+	/*
+	 * For a polled request, always complete locallly, it's pointless
+	 * to redirect the completion.
+	 */
+	if ((rq->cmd_flags & REQ_HIPRI) ||
+	    !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
 		q->mq_ops->complete(rq);
 		return;
 	}

From 16c15eb16a793f2d81ae52f41f43fb6831b34212 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 26 Nov 2018 09:54:28 -0700
Subject: [PATCH 158/351] blk-mq: Return true if request was completed

A driver may have internal state to cleanup if we're pretending a request
didn't complete. Return 'false' if the command wasn't actually completed
due to the timeout error injection, and true otherwise.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 5 +++--
 include/linux/blk-mq.h | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 37674c1766a7..7c8cfa0cd420 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -638,11 +638,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
  *	Ends all I/O on a request. It does not handle partial completions.
  *	The actual completion happens out-of-order, through a IPI handler.
  **/
-void blk_mq_complete_request(struct request *rq)
+bool blk_mq_complete_request(struct request *rq)
 {
 	if (unlikely(blk_should_fake_timeout(rq->q)))
-		return;
+		return false;
 	__blk_mq_complete_request(rq);
+	return true;
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ca0520ca6437..6e3da356a8eb 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -298,7 +298,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 				bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
-void blk_mq_complete_request(struct request *rq);
+bool blk_mq_complete_request(struct request *rq);
 bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
 			   struct bio *bio);
 bool blk_mq_queue_stopped(struct request_queue *q);

From f1342709d18af97b0e71449d5696b8873d1a456c Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 26 Nov 2018 09:54:29 -0700
Subject: [PATCH 159/351] scsi: Do not rely on blk-mq for double completions

The scsi timeout error handling had been directly updating the block
layer's request state to prevent a error handling and a natural completion
from completing the same request twice. Fix this layering violation
by having scsi control the fate of its commands with scsi owned flags
rather than use blk-mq's.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/scsi/scsi_error.c | 22 +++++++++++-----------
 drivers/scsi/scsi_lib.c   | 13 ++++++++++++-
 include/scsi/scsi_cmnd.h  |  4 ++++
 3 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index dd338a8cd275..16eef068e9e9 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -297,19 +297,19 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
 
 	if (rtn == BLK_EH_DONE) {
 		/*
-		 * For blk-mq, we must set the request state to complete now
-		 * before sending the request to the scsi error handler. This
-		 * will prevent a use-after-free in the event the LLD manages
-		 * to complete the request before the error handler finishes
-		 * processing this timed out request.
+		 * Set the command to complete first in order to prevent a real
+		 * completion from releasing the command while error handling
+		 * is using it. If the command was already completed, then the
+		 * lower level driver beat the timeout handler, and it is safe
+		 * to return without escalating error recovery.
 		 *
-		 * If the request was already completed, then the LLD beat the
-		 * time out handler from transferring the request to the scsi
-		 * error handler. In that case we can return immediately as no
-		 * further action is required.
+		 * If timeout handling lost the race to a real completion, the
+		 * block layer may ignore that due to a fake timeout injection,
+		 * so return RESET_TIMER to allow error handling another shot
+		 * at this command.
 		 */
-		if (!blk_mq_mark_complete(req))
-			return rtn;
+		if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state))
+			return BLK_EH_RESET_TIMER;
 		if (scsi_abort_command(scmd) != SUCCESS) {
 			set_host_byte(scmd, DID_TIME_OUT);
 			scsi_eh_scmd_add(scmd);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 0df15cb738d2..0dbf25512778 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1642,8 +1642,18 @@ static blk_status_t scsi_mq_prep_fn(struct request *req)
 
 static void scsi_mq_done(struct scsi_cmnd *cmd)
 {
+	if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
+		return;
 	trace_scsi_dispatch_cmd_done(cmd);
-	blk_mq_complete_request(cmd->request);
+
+	/*
+	 * If the block layer didn't complete the request due to a timeout
+	 * injection, scsi must clear its internal completed state so that the
+	 * timeout handler will see it needs to escalate its own error
+	 * recovery.
+	 */
+	if (unlikely(!blk_mq_complete_request(cmd->request)))
+		clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
 }
 
 static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx)
@@ -1702,6 +1712,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!scsi_host_queue_ready(q, shost, sdev))
 		goto out_dec_target_busy;
 
+	clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
 	if (!(req->rq_flags & RQF_DONTPREP)) {
 		ret = scsi_mq_prep_fn(req);
 		if (ret != BLK_STS_OK)
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index d6fd2aba0380..3de905e205ce 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -61,6 +61,9 @@ struct scsi_pointer {
 /* flags preserved across unprep / reprep */
 #define SCMD_PRESERVED_FLAGS	(SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED)
 
+/* for scmd->state */
+#define SCMD_STATE_COMPLETE	(1 << 0)
+
 struct scsi_cmnd {
 	struct scsi_request req;
 	struct scsi_device *device;
@@ -145,6 +148,7 @@ struct scsi_cmnd {
 
 	int result;		/* Status code from lower level driver */
 	int flags;		/* Command flags */
+	unsigned long state;	/* Command completion state */
 
 	unsigned char tag;	/* SCSI-II queued command tag */
 };

From af78ff7c6e66832afcdf5418f67b11c409f9e7a1 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Mon, 26 Nov 2018 09:54:30 -0700
Subject: [PATCH 160/351] blk-mq: Simplify request completion state

There are no more users relying on blk-mq request states to prevent
double completions, so replace the relatively expensive cmpxchg operation
with WRITE_ONCE.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         |  4 +---
 include/linux/blk-mq.h | 14 --------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7c8cfa0cd420..cda698804422 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -568,9 +568,7 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
-	if (!blk_mq_mark_complete(rq))
-		return;
-
+	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
 	/*
 	 * Most of single queue controllers, there is only one irq vector
 	 * for handling IO completion, and the only irq's affinity is set
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6e3da356a8eb..b8de11e0603b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -329,20 +329,6 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 
-/**
- * blk_mq_mark_complete() - Set request state to complete
- * @rq: request to set to complete state
- *
- * Returns true if request state was successfully set to complete. If
- * successful, the caller is responsibile for seeing this request is ended, as
- * blk_mq_complete_request will not work again.
- */
-static inline bool blk_mq_mark_complete(struct request *rq)
-{
-	return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) ==
-			MQ_RQ_IN_FLIGHT;
-}
-
 /*
  * Driver command data is immediately after the request. So subtract request
  * size to get back to the original request, add request size to get the PDU.

From 5f0ed774ed2914decfd397569fface997532e94d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 23 Nov 2018 22:04:33 -0700
Subject: [PATCH 161/351] block: sum requests in the plug structure

This isn't exactly the same as the previous count, as it includes
requests for all devices. But that really doesn't matter, if we have
more than the threshold (16) queued up, flush it. It's not worth it
to have an expensive list loop for this.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 30 ++++--------------------------
 block/blk-mq.c         | 16 +++++-----------
 block/blk.h            |  2 --
 include/linux/blkdev.h |  1 +
 4 files changed, 10 insertions(+), 39 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 9af56dbb84f1..be9233400314 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -736,7 +736,6 @@ no_merge:
  * Caller must ensure !blk_queue_nomerges(q) beforehand.
  */
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count,
 			    struct request **same_queue_rq)
 {
 	struct blk_plug *plug;
@@ -746,22 +745,19 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	plug = current->plug;
 	if (!plug)
 		return false;
-	*request_count = 0;
 
 	plug_list = &plug->mq_list;
 
 	list_for_each_entry_reverse(rq, plug_list, queuelist) {
 		bool merged = false;
 
-		if (rq->q == q) {
-			(*request_count)++;
+		if (rq->q == q && same_queue_rq) {
 			/*
 			 * Only blk-mq multiple hardware queues case checks the
 			 * rq in the same queue, there should be only one such
 			 * rq in a queue
 			 **/
-			if (same_queue_rq)
-				*same_queue_rq = rq;
+			*same_queue_rq = rq;
 		}
 
 		if (rq->q != q || !blk_rq_merge_ok(rq, bio))
@@ -788,26 +784,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
 	return false;
 }
 
-unsigned int blk_plug_queued_count(struct request_queue *q)
-{
-	struct blk_plug *plug;
-	struct request *rq;
-	struct list_head *plug_list;
-	unsigned int ret = 0;
-
-	plug = current->plug;
-	if (!plug)
-		goto out;
-
-	plug_list = &plug->mq_list;
-	list_for_each_entry(rq, plug_list, queuelist) {
-		if (rq->q == q)
-			ret++;
-	}
-out:
-	return ret;
-}
-
 void blk_init_request_from_bio(struct request *req, struct bio *bio)
 {
 	if (bio->bi_opf & REQ_RAHEAD)
@@ -1803,6 +1779,8 @@ void blk_start_plug(struct blk_plug *plug)
 
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
+	plug->rq_count = 0;
+
 	/*
 	 * Store ordering should not be needed here, since a potential
 	 * preempt will imply a full memory barrier
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cda698804422..7b7dff85cf6c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1675,6 +1675,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	unsigned int depth;
 
 	list_splice_init(&plug->mq_list, &list);
+	plug->rq_count = 0;
 
 	list_sort(NULL, &list, plug_rq_cmp);
 
@@ -1871,7 +1872,6 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf };
 	struct request *rq;
-	unsigned int request_count = 0;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
 	blk_qc_t cookie;
@@ -1884,7 +1884,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		return BLK_QC_T_NONE;
 
 	if (!is_flush_fua && !blk_queue_nomerges(q) &&
-	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
+	    blk_attempt_plug_merge(q, bio, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
 	if (blk_mq_sched_bio_merge(q, bio))
@@ -1915,20 +1915,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		blk_insert_flush(rq);
 		blk_mq_run_hw_queue(data.hctx, true);
 	} else if (plug && q->nr_hw_queues == 1) {
+		unsigned int request_count = plug->rq_count;
 		struct request *last = NULL;
 
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
 
-		/*
-		 * @request_count may become stale because of schedule
-		 * out, so check the list again.
-		 */
-		if (list_empty(&plug->mq_list))
-			request_count = 0;
-		else if (blk_queue_nomerges(q))
-			request_count = blk_plug_queued_count(q);
-
 		if (!request_count)
 			trace_block_plug(q);
 		else
@@ -1941,6 +1933,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		}
 
 		list_add_tail(&rq->queuelist, &plug->mq_list);
+		plug->rq_count++;
 	} else if (plug && !blk_queue_nomerges(q)) {
 		blk_mq_bio_to_request(rq, bio);
 
@@ -1956,6 +1949,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		if (same_queue_rq)
 			list_del_init(&same_queue_rq->queuelist);
 		list_add_tail(&rq->queuelist, &plug->mq_list);
+		plug->rq_count++;
 
 		blk_mq_put_ctx(data.ctx);
 
diff --git a/block/blk.h b/block/blk.h
index 610948157a5b..848278c52030 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -161,9 +161,7 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
 		struct bio *bio);
 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
-			    unsigned int *request_count,
 			    struct request **same_queue_rq);
-unsigned int blk_plug_queued_count(struct request_queue *q);
 
 void blk_account_io_start(struct request *req, bool new_io);
 void blk_account_io_completion(struct request *req, unsigned int bytes);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e3c0a8ec16a7..02732cae6080 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1130,6 +1130,7 @@ extern void blk_set_queue_dying(struct request_queue *);
 struct blk_plug {
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
+	unsigned short rq_count;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)

From a11f6ca9aef989b56cd31ff4ee2af4fb31a172ec Mon Sep 17 00:00:00 2001
From: Young Xiao <YangX92@hotmail.com>
Date: Wed, 28 Nov 2018 12:36:39 +0000
Subject: [PATCH 162/351] sunvdc: Do not spin in an infinite loop when
 vio_ldc_send() returns EAGAIN

__vdc_tx_trigger should only loop on EAGAIN a finite
number of times.

See commit adddc32d6fde ("sunvnet: Do not spin in an
infinite loop when vio_ldc_send() returns EAGAIN") for detail.

Signed-off-by: Young Xiao <YangX92@hotmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/sunvdc.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 175a64619602..9c0553dd13e7 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -45,6 +45,8 @@ MODULE_VERSION(DRV_MODULE_VERSION);
 #define WAITING_FOR_GEN_CMD	0x04
 #define WAITING_FOR_ANY		-1
 
+#define	VDC_MAX_RETRIES	10
+
 static struct workqueue_struct *sunvdc_wq;
 
 struct vdc_req_entry {
@@ -431,6 +433,7 @@ static int __vdc_tx_trigger(struct vdc_port *port)
 		.end_idx		= dr->prod,
 	};
 	int err, delay;
+	int retries = 0;
 
 	hdr.seq = dr->snd_nxt;
 	delay = 1;
@@ -443,6 +446,8 @@ static int __vdc_tx_trigger(struct vdc_port *port)
 		udelay(delay);
 		if ((delay <<= 1) > 128)
 			delay = 128;
+		if (retries++ > VDC_MAX_RETRIES)
+			break;
 	} while (err == -EAGAIN);
 
 	if (err == -ENOTCONN)

From 4711b57317f0ff5ca9fbd5e2df6c73b2c07ddc53 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 Nov 2018 17:07:17 -0700
Subject: [PATCH 163/351] blk-mq: fix failure to decrement plug count on single
 rq removal

If we yank a 'same_queue_rq' request off the plug list, we should
also decrement the cached request count.

Fixes: 5f0ed774ed29 ("block: sum requests in the plug structure")
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7b7dff85cf6c..a82830f39933 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1946,8 +1946,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		 */
 		if (list_empty(&plug->mq_list))
 			same_queue_rq = NULL;
-		if (same_queue_rq)
+		if (same_queue_rq) {
 			list_del_init(&same_queue_rq->queuelist);
+			plug->rq_count--;
+		}
 		list_add_tail(&rq->queuelist, &plug->mq_list);
 		plug->rq_count++;
 

From 94a2c3a32b62e868dc1e3d854326745a7f1b8c7a Mon Sep 17 00:00:00 2001
From: Yufen Yu <yuyufen@huawei.com>
Date: Wed, 28 Nov 2018 16:42:01 +0800
Subject: [PATCH 164/351] block: use rcu_work instead of call_rcu to avoid
 sleep in softirq

We recently got a stack by syzkaller like this:

BUG: sleeping function called from invalid context at mm/slab.h:361
in_atomic(): 1, irqs_disabled(): 0, pid: 6644, name: blkid
INFO: lockdep is turned off.
CPU: 1 PID: 6644 Comm: blkid Not tainted 4.4.163-514.55.6.9.x86_64+ #76
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
 0000000000000000 5ba6a6b879e50c00 ffff8801f6b07b10 ffffffff81cb2194
 0000000041b58ab3 ffffffff833c7745 ffffffff81cb2080 5ba6a6b879e50c00
 0000000000000000 0000000000000001 0000000000000004 0000000000000000
Call Trace:
 <IRQ>  [<ffffffff81cb2194>] __dump_stack lib/dump_stack.c:15 [inline]
 <IRQ>  [<ffffffff81cb2194>] dump_stack+0x114/0x1a0 lib/dump_stack.c:51
 [<ffffffff8129a981>] ___might_sleep+0x291/0x490 kernel/sched/core.c:7675
 [<ffffffff8129ac33>] __might_sleep+0xb3/0x270 kernel/sched/core.c:7637
 [<ffffffff81794c13>] slab_pre_alloc_hook mm/slab.h:361 [inline]
 [<ffffffff81794c13>] slab_alloc_node mm/slub.c:2610 [inline]
 [<ffffffff81794c13>] slab_alloc mm/slub.c:2692 [inline]
 [<ffffffff81794c13>] kmem_cache_alloc_trace+0x2c3/0x5c0 mm/slub.c:2709
 [<ffffffff81cbe9a7>] kmalloc include/linux/slab.h:479 [inline]
 [<ffffffff81cbe9a7>] kzalloc include/linux/slab.h:623 [inline]
 [<ffffffff81cbe9a7>] kobject_uevent_env+0x2c7/0x1150 lib/kobject_uevent.c:227
 [<ffffffff81cbf84f>] kobject_uevent+0x1f/0x30 lib/kobject_uevent.c:374
 [<ffffffff81cbb5b9>] kobject_cleanup lib/kobject.c:633 [inline]
 [<ffffffff81cbb5b9>] kobject_release+0x229/0x440 lib/kobject.c:675
 [<ffffffff81cbb0a2>] kref_sub include/linux/kref.h:73 [inline]
 [<ffffffff81cbb0a2>] kref_put include/linux/kref.h:98 [inline]
 [<ffffffff81cbb0a2>] kobject_put+0x72/0xd0 lib/kobject.c:692
 [<ffffffff8216f095>] put_device+0x25/0x30 drivers/base/core.c:1237
 [<ffffffff81c4cc34>] delete_partition_rcu_cb+0x1d4/0x2f0 block/partition-generic.c:232
 [<ffffffff813c08bc>] __rcu_reclaim kernel/rcu/rcu.h:118 [inline]
 [<ffffffff813c08bc>] rcu_do_batch kernel/rcu/tree.c:2705 [inline]
 [<ffffffff813c08bc>] invoke_rcu_callbacks kernel/rcu/tree.c:2973 [inline]
 [<ffffffff813c08bc>] __rcu_process_callbacks kernel/rcu/tree.c:2940 [inline]
 [<ffffffff813c08bc>] rcu_process_callbacks+0x59c/0x1c70 kernel/rcu/tree.c:2957
 [<ffffffff8120f509>] __do_softirq+0x299/0xe20 kernel/softirq.c:273
 [<ffffffff81210496>] invoke_softirq kernel/softirq.c:350 [inline]
 [<ffffffff81210496>] irq_exit+0x216/0x2c0 kernel/softirq.c:391
 [<ffffffff82c2cd7b>] exiting_irq arch/x86/include/asm/apic.h:652 [inline]
 [<ffffffff82c2cd7b>] smp_apic_timer_interrupt+0x8b/0xc0 arch/x86/kernel/apic/apic.c:926
 [<ffffffff82c2bc25>] apic_timer_interrupt+0xa5/0xb0 arch/x86/entry/entry_64.S:746
 <EOI>  [<ffffffff814cbf40>] ? audit_kill_trees+0x180/0x180
 [<ffffffff8187d2f7>] fd_install+0x57/0x80 fs/file.c:626
 [<ffffffff8180989e>] do_sys_open+0x45e/0x550 fs/open.c:1043
 [<ffffffff818099c2>] SYSC_open fs/open.c:1055 [inline]
 [<ffffffff818099c2>] SyS_open+0x32/0x40 fs/open.c:1050
 [<ffffffff82c299e1>] entry_SYSCALL_64_fastpath+0x1e/0x9a

In softirq context, we call rcu callback function delete_partition_rcu_cb(),
which may allocate memory by kzalloc with GFP_KERNEL flag. If the
allocation cannot be satisfied, it may sleep. However, That is not allowed
in softirq contex.

Although we found this problem on linux 4.4, the latest kernel version
seems to have this problem as well. And it is very similar to the
previous one:
	https://lkml.org/lkml/2018/7/9/391

Fix it by using RCU workqueue, which allows sleep.

Reviewed-by: Paul E. McKenney <paulmck@linux.ibm.com>
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/partition-generic.c | 8 +++++---
 include/linux/genhd.h     | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/block/partition-generic.c b/block/partition-generic.c
index d3d14e81fb12..5f8db5c5140f 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -249,9 +249,10 @@ struct device_type part_type = {
 	.uevent		= part_uevent,
 };
 
-static void delete_partition_rcu_cb(struct rcu_head *head)
+static void delete_partition_work_fn(struct work_struct *work)
 {
-	struct hd_struct *part = container_of(head, struct hd_struct, rcu_head);
+	struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct,
+					rcu_work);
 
 	part->start_sect = 0;
 	part->nr_sects = 0;
@@ -262,7 +263,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head)
 void __delete_partition(struct percpu_ref *ref)
 {
 	struct hd_struct *part = container_of(ref, struct hd_struct, ref);
-	call_rcu(&part->rcu_head, delete_partition_rcu_cb);
+	INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn);
+	queue_rcu_work(system_wq, &part->rcu_work);
 }
 
 /*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 70fc838e6773..0c5ee17b4d88 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,7 +129,7 @@ struct hd_struct {
 	struct disk_stats dkstats;
 #endif
 	struct percpu_ref ref;
-	struct rcu_head rcu_head;
+	struct rcu_work rcu_work;
 };
 
 #define GENHD_FL_REMOVABLE			1

From 65cd1d13b880920054d6c750679baa80b7f9c072 Mon Sep 17 00:00:00 2001
From: Weiping Zhang <zhangweiping@didiglobal.com>
Date: Thu, 29 Nov 2018 00:04:39 +0800
Subject: [PATCH 165/351] block: add io timeout to sysfs

Give a interface to adjust io timeout(ms) by device.

Signed-off-by: Weiping Zhang <zhangweiping@didiglobal.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 80eef48fddc8..9f0cb370b39b 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -417,6 +417,26 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
+{
+	return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout));
+}
+
+static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	unsigned int val;
+	int err;
+
+	err = kstrtou32(page, 10, &val);
+	if (err || val == 0)
+		return -EINVAL;
+
+	blk_queue_rq_timeout(q, msecs_to_jiffies(val));
+
+	return count;
+}
+
 static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
 {
 	if (!wbt_rq_qos(q))
@@ -685,6 +705,12 @@ static struct queue_sysfs_entry queue_dax_entry = {
 	.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_io_timeout_entry = {
+	.attr = {.name = "io_timeout", .mode = 0644 },
+	.show = queue_io_timeout_show,
+	.store = queue_io_timeout_store,
+};
+
 static struct queue_sysfs_entry queue_wb_lat_entry = {
 	.attr = {.name = "wbt_lat_usec", .mode = 0644 },
 	.show = queue_wb_lat_show,
@@ -734,6 +760,7 @@ static struct attribute *default_attrs[] = {
 	&queue_dax_entry.attr,
 	&queue_wb_lat_entry.attr,
 	&queue_poll_delay_entry.attr,
+	&queue_io_timeout_entry.attr,
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 	&throtl_sample_time_entry.attr,
 #endif

From 49379e6d1e9370d1e5dc09ca52aff29ae07c8ba6 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 29 Nov 2018 13:55:19 +0300
Subject: [PATCH 166/351] ataflop: fix error handling in atari_floppy_init()

Smatch complains that there is an off by one if the allocation fails in:

	DMABuffer = atari_stram_alloc(BUFFER_SIZE+512, "ataflop");

In that situation, "i" would be point to one element beyond the end of
the unit[] array.

There is a second bug because the error handling calls
blk_mq_free_tag_set(&unit[i].tag_set); regardless of whether
"disk->queue" is NULL or non-NULL.  So if blk_mq_init_sq_queue() fails,
then that means unit[i].tag_set->tags is NULL and it leads to an Oops.

It's easiest to call put_disk() before the goto to clean up the partial
iteration.  Then the earlier unit[] elements are fully allocated so we
can remove the checks whether "disk->queue" is NULL and the code is
simpler.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ataflop.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index f88b4c26d422..313064b1005c 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1982,6 +1982,7 @@ static int __init atari_floppy_init (void)
 							   &ataflop_mq_ops, 2,
 							   BLK_MQ_F_SHOULD_MERGE);
 		if (IS_ERR(unit[i].disk->queue)) {
+			put_disk(unit[i].disk);
 			ret = PTR_ERR(unit[i].disk->queue);
 			unit[i].disk->queue = NULL;
 			goto err;
@@ -2033,18 +2034,13 @@ static int __init atari_floppy_init (void)
 	return 0;
 
 err:
-	do {
+	while (--i >= 0) {
 		struct gendisk *disk = unit[i].disk;
 
-		if (disk) {
-			if (disk->queue) {
-				blk_cleanup_queue(disk->queue);
-				disk->queue = NULL;
-			}
-			blk_mq_free_tag_set(&unit[i].tag_set);
-			put_disk(unit[i].disk);
-		}
-	} while (i--);
+		blk_cleanup_queue(disk->queue);
+		blk_mq_free_tag_set(&unit[i].tag_set);
+		put_disk(unit[i].disk);
+	}
 
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 	return ret;

From 4e6db0f21c99c25980c8d183f95cdb6ad64cebd2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 29 Nov 2018 13:56:54 +0300
Subject: [PATCH 167/351] blk-mq: Add a NULL check in
 blk_mq_free_map_and_requests()

I recently found some code which called blk_mq_free_map_and_requests()
with a NULL set->tags pointer.  I fixed the caller, but it seems like a
good idea to add a NULL check here as well.  Now we can call:

	blk_mq_free_tag_set(set);
	blk_mq_free_tag_set(set);

twice in a row and it's harmless.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a82830f39933..5f4b93f424b4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2341,7 +2341,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
 					 unsigned int hctx_idx)
 {
-	if (set->tags[hctx_idx]) {
+	if (set->tags && set->tags[hctx_idx]) {
 		blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
 		blk_mq_free_rq_map(set->tags[hctx_idx]);
 		set->tags[hctx_idx] = NULL;

From ce5b009cff1961137127edf91f44effd0eec8ffd Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 Nov 2018 17:13:56 -0700
Subject: [PATCH 168/351] block: improve logic around when to sort a plug list

Only do it if we have requests for multiple queues in the same
plug.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       |  1 +
 block/blk-mq.c         | 23 ++++++++++++++++++-----
 include/linux/blkdev.h |  1 +
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index be9233400314..d107d016b92b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1780,6 +1780,7 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->mq_list);
 	INIT_LIST_HEAD(&plug->cb_list);
 	plug->rq_count = 0;
+	plug->multiple_queues = false;
 
 	/*
 	 * Store ordering should not be needed here, since a potential
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5f4b93f424b4..2a1a653a8054 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1677,7 +1677,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	list_splice_init(&plug->mq_list, &list);
 	plug->rq_count = 0;
 
-	list_sort(NULL, &list, plug_rq_cmp);
+	if (plug->rq_count > 2 && plug->multiple_queues)
+		list_sort(NULL, &list, plug_rq_cmp);
 
 	this_q = NULL;
 	this_hctx = NULL;
@@ -1866,6 +1867,20 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
+{
+	list_add_tail(&rq->queuelist, &plug->mq_list);
+	plug->rq_count++;
+	if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
+		struct request *tmp;
+
+		tmp = list_first_entry(&plug->mq_list, struct request,
+						queuelist);
+		if (tmp->q != rq->q)
+			plug->multiple_queues = true;
+	}
+}
+
 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
@@ -1932,8 +1947,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 			trace_block_plug(q);
 		}
 
-		list_add_tail(&rq->queuelist, &plug->mq_list);
-		plug->rq_count++;
+		blk_add_rq_to_plug(plug, rq);
 	} else if (plug && !blk_queue_nomerges(q)) {
 		blk_mq_bio_to_request(rq, bio);
 
@@ -1950,8 +1964,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 			list_del_init(&same_queue_rq->queuelist);
 			plug->rq_count--;
 		}
-		list_add_tail(&rq->queuelist, &plug->mq_list);
-		plug->rq_count++;
+		blk_add_rq_to_plug(plug, rq);
 
 		blk_mq_put_ctx(data.ctx);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 02732cae6080..08d940f85fa0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1131,6 +1131,7 @@ struct blk_plug {
 	struct list_head mq_list; /* blk-mq requests */
 	struct list_head cb_list; /* md requires an unplug callback */
 	unsigned short rq_count;
+	bool multiple_queues;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)

From d666ba98f849ad44c4405ecc2180390ebe80f4f9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 27 Nov 2018 17:02:25 -0700
Subject: [PATCH 169/351] blk-mq: add mq_ops->commit_rqs()

blk-mq passes information to the hardware about any given request being
the last that we will issue in this sequence. The point is that hardware
can defer costly doorbell type writes to the last request. But if we run
into errors issuing a sequence of requests, we may never send the request
with bd->last == true set. For that case, we need a hook that tells the
hardware that nothing else is coming right now.

For failures returned by the drivers ->queue_rq() hook, the driver is
responsible for flushing pending requests, if it uses bd->last to
optimize that part. This works like before, no changes there.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 16 ++++++++++++++++
 include/linux/blk-mq.h | 10 ++++++++++
 2 files changed, 26 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2a1a653a8054..d8534107bb6f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1259,6 +1259,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 	if (!list_empty(list)) {
 		bool needs_restart;
 
+		/*
+		 * If we didn't flush the entire list, we could have told
+		 * the driver there was more coming, but that turned out to
+		 * be a lie.
+		 */
+		if (q->mq_ops->commit_rqs)
+			q->mq_ops->commit_rqs(hctx);
+
 		spin_lock(&hctx->lock);
 		list_splice_init(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
@@ -1865,6 +1873,14 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 			blk_mq_end_request(rq, ret);
 		}
 	}
+
+	/*
+	 * If we didn't flush the entire list, we could have told
+	 * the driver there was more coming, but that turned out to
+	 * be a lie.
+	 */
+	if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs)
+		hctx->queue->mq_ops->commit_rqs(hctx);
 }
 
 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index b8de11e0603b..467f1dd21ccf 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -117,6 +117,7 @@ struct blk_mq_queue_data {
 
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
+typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
 /* takes rq->cmd_flags as input, returns a hardware type index */
 typedef int (rq_flags_to_type_fn)(struct request_queue *, unsigned int);
 typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
@@ -144,6 +145,15 @@ struct blk_mq_ops {
 	 */
 	queue_rq_fn		*queue_rq;
 
+	/*
+	 * If a driver uses bd->last to judge when to submit requests to
+	 * hardware, it must define this function. In case of errors that
+	 * make us stop issuing further requests, this hook serves the
+	 * purpose of kicking the hardware (which the last request otherwise
+	 * would have done).
+	 */
+	commit_rqs_fn		*commit_rqs;
+
 	/*
 	 * Return a queue map type for the given request/bio flags
 	 */

From 04f3eafda6e05adc56afed4d3ae6e24aaa429058 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 29 Nov 2018 10:02:29 -0700
Subject: [PATCH 170/351] nvme: implement mq_ops->commit_rqs() hook

Split the command submission and the SQ doorbell ring, and add the
doorbell ring as our ->commit_rqs() hook. This allows a list of
requests to be issued, with nvme only writing the SQ update when
it's necessary. This is more efficient if we have lists of requests
to issue, particularly on virtualized hardware, where writing the
SQ doorbell is more expensive than on real hardware. For those cases,
performance increases of 2-3x have been observed.

The use case for this is plugged IO, where blk-mq flushes a batch of
requests at the time.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 47 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 73effe586e5f..527907aa6903 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -203,6 +203,7 @@ struct nvme_queue {
 	u16 q_depth;
 	s16 cq_vector;
 	u16 sq_tail;
+	u16 last_sq_tail;
 	u16 cq_head;
 	u16 last_cq_head;
 	u16 qid;
@@ -522,22 +523,50 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+/*
+ * Write sq tail if we are asked to, or if the next command would wrap.
+ */
+static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
+{
+	if (!write_sq) {
+		u16 next_tail = nvmeq->sq_tail + 1;
+
+		if (next_tail == nvmeq->q_depth)
+			next_tail = 0;
+		if (next_tail != nvmeq->last_sq_tail)
+			return;
+	}
+
+	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
+			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
+		writel(nvmeq->sq_tail, nvmeq->q_db);
+	nvmeq->last_sq_tail = nvmeq->sq_tail;
+}
+
 /**
  * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
  * @nvmeq: The queue to use
  * @cmd: The command to send
+ * @write_sq: whether to write to the SQ doorbell
  */
-static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+			    bool write_sq)
 {
 	spin_lock(&nvmeq->sq_lock);
-
 	memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
-
 	if (++nvmeq->sq_tail == nvmeq->q_depth)
 		nvmeq->sq_tail = 0;
-	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
-			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
-		writel(nvmeq->sq_tail, nvmeq->q_db);
+	nvme_write_sq_db(nvmeq, write_sq);
+	spin_unlock(&nvmeq->sq_lock);
+}
+
+static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct nvme_queue *nvmeq = hctx->driver_data;
+
+	spin_lock(&nvmeq->sq_lock);
+	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
+		nvme_write_sq_db(nvmeq, true);
 	spin_unlock(&nvmeq->sq_lock);
 }
 
@@ -923,7 +952,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	}
 
 	blk_mq_start_request(req);
-	nvme_submit_cmd(nvmeq, &cmnd);
+	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
 	return BLK_STS_OK;
 out_cleanup_iod:
 	nvme_free_iod(dev, req);
@@ -1108,7 +1137,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
-	nvme_submit_cmd(nvmeq, &c);
+	nvme_submit_cmd(nvmeq, &c, true);
 }
 
 static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
@@ -1531,6 +1560,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 
 	spin_lock_irq(&nvmeq->cq_lock);
 	nvmeq->sq_tail = 0;
+	nvmeq->last_sq_tail = 0;
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1603,6 +1633,7 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 
 #define NVME_SHARED_MQ_OPS					\
 	.queue_rq		= nvme_queue_rq,		\
+	.commit_rqs		= nvme_commit_rqs,		\
 	.rq_flags_to_type	= nvme_rq_flags_to_type,	\
 	.complete		= nvme_pci_complete_rq,		\
 	.init_hctx		= nvme_init_hctx,		\

From 944e7c87967c820a0f34a935b1f2799944099750 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 26 Nov 2018 11:00:12 -0700
Subject: [PATCH 171/351] virtio_blk: implement mq_ops->commit_rqs() hook

We need this for blk-mq to kick things into gear, if we told it that
we had more IO coming, but then failed to deliver on that promise.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/virtio_blk.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6e869d05f91e..912c4265e592 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -214,6 +214,20 @@ static void virtblk_done(struct virtqueue *vq)
 	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
 }
 
+static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
+	bool kick;
+
+	spin_lock_irq(&vq->lock);
+	kick = virtqueue_kick_prepare(vq->vq);
+	spin_unlock_irq(&vq->lock);
+
+	if (kick)
+		virtqueue_notify(vq->vq);
+}
+
 static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
 			   const struct blk_mq_queue_data *bd)
 {
@@ -638,6 +652,7 @@ static void virtblk_initialize_rq(struct request *req)
 
 static const struct blk_mq_ops virtio_mq_ops = {
 	.queue_rq	= virtio_queue_rq,
+	.commit_rqs	= virtio_commit_rqs,
 	.complete	= virtblk_request_done,
 	.init_request	= virtblk_init_request,
 #ifdef CONFIG_VIRTIO_BLK_SCSI

From 80ff2040ac3dd6d9d68d59159b0a6dffd8adc5ff Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 28 Nov 2018 06:09:07 -0700
Subject: [PATCH 172/351] ataflop: implement mq_ops->commit_rqs() hook

We need this for blk-mq to kick things into gear, if we told it that
we had more IO coming, but then failed to deliver on that promise.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/ataflop.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 313064b1005c..b0dbbdfeb33e 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1471,6 +1471,15 @@ static void setup_req_params( int drive )
 			ReqTrack, ReqSector, (unsigned long)ReqData ));
 }
 
+static void ataflop_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	spin_lock_irq(&ataflop_lock);
+	atari_disable_irq(IRQ_MFP_FDC);
+	finish_fdc();
+	atari_enable_irq(IRQ_MFP_FDC);
+	spin_unlock_irq(&ataflop_lock);
+}
+
 static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
 				     const struct blk_mq_queue_data *bd)
 {
@@ -1947,6 +1956,7 @@ static const struct block_device_operations floppy_fops = {
 
 static const struct blk_mq_ops ataflop_mq_ops = {
 	.queue_rq = ataflop_queue_rq,
+	.commit_rqs = ataflop_commit_rqs,
 };
 
 static struct kobject *floppy_find(dev_t dev, int *part, void *data)

From be94f058f2bde6f0b0ee9059a35daa8e15be308f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 24 Nov 2018 10:15:46 -0700
Subject: [PATCH 173/351] blk-mq: use bd->last == true for list inserts

If we are issuing a list of requests, we know if we're at the last one.
If we fail issuing, ensure that we call ->commits_rqs() to flush any
potential previous requests.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c |  2 +-
 block/blk-mq.c   | 16 ++++++++--------
 block/blk-mq.h   |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d107d016b92b..3f6f5e6c2fe4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1334,7 +1334,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 	 * bypass a potential scheduler on the bottom device for
 	 * insert.
 	 */
-	return blk_mq_request_issue_directly(rq);
+	return blk_mq_request_issue_directly(rq, true);
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d8534107bb6f..14e31e93a950 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1743,12 +1743,12 @@ static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq)
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 					    struct request *rq,
-					    blk_qc_t *cookie)
+					    blk_qc_t *cookie, bool last)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
-		.last = true,
+		.last = last,
 	};
 	blk_qc_t new_cookie;
 	blk_status_t ret;
@@ -1783,7 +1783,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
 						blk_qc_t *cookie,
-						bool bypass_insert)
+						bool bypass_insert, bool last)
 {
 	struct request_queue *q = rq->q;
 	bool run_queue = true;
@@ -1812,7 +1812,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	return __blk_mq_issue_directly(hctx, rq, cookie);
+	return __blk_mq_issue_directly(hctx, rq, cookie, last);
 insert:
 	if (bypass_insert)
 		return BLK_STS_RESOURCE;
@@ -1831,7 +1831,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 	hctx_lock(hctx, &srcu_idx);
 
-	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false);
+	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
 		blk_mq_sched_insert_request(rq, false, true, false);
 	else if (ret != BLK_STS_OK)
@@ -1840,7 +1840,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	hctx_unlock(hctx, srcu_idx);
 }
 
-blk_status_t blk_mq_request_issue_directly(struct request *rq)
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
 	blk_status_t ret;
 	int srcu_idx;
@@ -1848,7 +1848,7 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq)
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
 
 	hctx_lock(hctx, &srcu_idx);
-	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true);
+	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
 	hctx_unlock(hctx, srcu_idx);
 
 	return ret;
@@ -1863,7 +1863,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 				queuelist);
 
 		list_del_init(&rq->queuelist);
-		ret = blk_mq_request_issue_directly(rq);
+		ret = blk_mq_request_issue_directly(rq, list_empty(list));
 		if (ret != BLK_STS_OK) {
 			if (ret == BLK_STS_RESOURCE ||
 					ret == BLK_STS_DEV_RESOURCE) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9ae8e9f8f8b1..7291e5379358 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -69,7 +69,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct list_head *list);
 
 /* Used by blk_insert_cloned_request() to issue request directly */
-blk_status_t blk_mq_request_issue_directly(struct request *rq);
+blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 				    struct list_head *list);
 

From b2c5d16b72df1116f05c9be16a630ac939d34101 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 29 Nov 2018 10:03:42 -0700
Subject: [PATCH 174/351] blk-mq: use plug for devices that implement
 ->commits_rqs()

If we have that hook, we know the driver handles bd->last == true in
a smart fashion. If it does, even for multiple hardware queues, it's
a good idea to flush batches of requests to the device, if we have
batches of requests from the submitter.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 14e31e93a950..7dcef565dc0f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1945,7 +1945,11 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		/* bypass scheduler for flush rq */
 		blk_insert_flush(rq);
 		blk_mq_run_hw_queue(data.hctx, true);
-	} else if (plug && q->nr_hw_queues == 1) {
+	} else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
+		/*
+		 * Use plugging if we have a ->commit_rqs() hook as well, as
+		 * we know the driver uses bd->last in a smart fashion.
+		 */
 		unsigned int request_count = plug->rq_count;
 		struct request *last = NULL;
 

From 27fae429acee1e9418059e7fa545438075af5256 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 29 Nov 2018 12:35:16 -0700
Subject: [PATCH 175/351] sbitmap: don't loop for find_next_zero_bit() for
 !round_robin

If we aren't forced to do round robin tag allocation, just use the
allocation hint to find the index for the tag word, don't use it for the
offset inside the word. This avoids a potential extra round trip in the
bit looping, and since we're fetching this cacheline, we may as well
check the whole word from the start.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 lib/sbitmap.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index fdd1b8aa8ac6..45cab6bbc1c7 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -118,10 +118,19 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 
 	index = SB_NR_TO_INDEX(sb, alloc_hint);
 
+	/*
+	 * Unless we're doing round robin tag allocation, just use the
+	 * alloc_hint to find the right word index. No point in looping
+	 * twice in find_next_zero_bit() for that case.
+	 */
+	if (round_robin)
+		alloc_hint = SB_NR_TO_BIT(sb, alloc_hint);
+	else
+		alloc_hint = 0;
+
 	for (i = 0; i < sb->map_nr; i++) {
 		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth,
-					SB_NR_TO_BIT(sb, alloc_hint),
+					sb->map[index].depth, alloc_hint,
 					!round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
@@ -129,13 +138,9 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		}
 
 		/* Jump to next index. */
-		index++;
-		alloc_hint = index << sb->shift;
-
-		if (index >= sb->map_nr) {
+		alloc_hint = 0;
+		if (++index >= sb->map_nr)
 			index = 0;
-			alloc_hint = 0;
-		}
 	}
 
 	return nr;

From 531724abc3bfb556c1dd68086cf9cb51f76464e3 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 30 Nov 2018 09:23:48 +0100
Subject: [PATCH 176/351] block: avoid extra bio reference for async O_DIRECT

The bio referencing has a trick that doesn't do any actual atomic
inc/dec on the reference count until we have to elevator to > 1. For the
async IO O_DIRECT case, we can't use the simple DIO variants, so we use
__blkdev_direct_IO(). It always grabs an extra reference to the bio
after allocation, which means we then enter the slower path of actually
having to do atomic_inc/dec on the count.

We don't need to do that for the async case, unless we end up going
multi-bio, in which case we're already doing huge amounts of IO. For the
smaller IO case (< BIO_MAX_PAGES), we can do without the extra ref.

Based on an earlier patch (and commit log) from Jens Axboe.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index d233a59ea364..e1886cc7048f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -302,7 +302,8 @@ static void blkdev_bio_end_io(struct bio *bio)
 			}
 
 			dio->iocb->ki_complete(iocb, ret, 0);
-			bio_put(&dio->bio);
+			if (dio->multi_bio)
+				bio_put(&dio->bio);
 		} else {
 			struct task_struct *waiter = dio->waiter;
 
@@ -343,14 +344,15 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		return -EINVAL;
 
 	bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool);
-	bio_get(bio); /* extra ref for the completion handler */
 
 	dio = container_of(bio, struct blkdev_dio, bio);
 	dio->is_sync = is_sync = is_sync_kiocb(iocb);
-	if (dio->is_sync)
+	if (dio->is_sync) {
 		dio->waiter = current;
-	else
+		bio_get(bio);
+	} else {
 		dio->iocb = iocb;
+	}
 
 	dio->size = 0;
 	dio->multi_bio = false;
@@ -400,6 +402,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
 		}
 
 		if (!dio->multi_bio) {
+			/*
+			 * AIO needs an extra reference to ensure the dio
+			 * structure which is embedded into the first bio
+			 * stays around.
+			 */
+			if (!is_sync)
+				bio_get(bio);
 			dio->multi_bio = true;
 			atomic_set(&dio->ref, 2);
 		} else {

From ea86ea2cdced20057da4d2c32965c1219c238197 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 30 Nov 2018 13:18:06 -0700
Subject: [PATCH 177/351] sbitmap: ammortize cost of clearing bits

sbitmap maintains a set of words that we use to set and clear bits, with
each bit representing a tag for blk-mq. Even though we spread the bits
out and maintain a hint cache, one particular bit allocated will end up
being cleared in the exact same spot.

This introduces batched clearing of bits. Instead of clearing a given
bit, the same bit is set in a cleared/free mask instead. If we fail
allocating a bit from a given word, then we check the free mask, and
batch move those cleared bits at that time. This trades 64 atomic bitops
for 2 cmpxchg().

In a threaded poll test case, half the overhead of getting and clearing
tags is removed with this change. On another poll test case with a
single thread, performance is unchanged.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 37 +++++++++++++++----
 lib/sbitmap.c           | 81 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 102 insertions(+), 16 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 804a50983ec5..81359d45751e 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -30,14 +30,24 @@ struct seq_file;
  */
 struct sbitmap_word {
 	/**
-	 * @word: The bitmap word itself.
-	 */
-	unsigned long word;
-
-	/**
-	 * @depth: Number of bits being used in @word.
+	 * @depth: Number of bits being used in @word/@cleared
 	 */
 	unsigned long depth;
+
+	/**
+	 * @word: word holding free bits
+	 */
+	unsigned long word ____cacheline_aligned_in_smp;
+
+	/**
+	 * @cleared: word holding cleared bits
+	 */
+	unsigned long cleared ____cacheline_aligned_in_smp;
+
+	/**
+	 * @swap_lock: Held while swapping word <-> cleared
+	 */
+	spinlock_t swap_lock;
 } ____cacheline_aligned_in_smp;
 
 /**
@@ -310,6 +320,19 @@ static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
 	clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
+/*
+ * This one is special, since it doesn't actually clear the bit, rather it
+ * sets the corresponding bit in the ->cleared mask instead. Paired with
+ * the caller doing sbitmap_batch_clear() if a given index is full, which
+ * will clear the previously freed entries in the corresponding ->word.
+ */
+static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
+{
+	unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared;
+
+	set_bit(SB_NR_TO_BIT(sb, bitnr), addr);
+}
+
 static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb,
 					    unsigned int bitnr)
 {
@@ -321,8 +344,6 @@ static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
 	return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr));
 }
 
-unsigned int sbitmap_weight(const struct sbitmap *sb);
-
 /**
  * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
  * @sb: Bitmap to show.
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 45cab6bbc1c7..f99382e59314 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -59,6 +59,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 	for (i = 0; i < sb->map_nr; i++) {
 		sb->map[i].depth = min(depth, bits_per_word);
 		depth -= sb->map[i].depth;
+		spin_lock_init(&sb->map[i].swap_lock);
 	}
 	return 0;
 }
@@ -111,6 +112,57 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+	bool ret = false;
+
+	spin_lock(&sb->map[index].swap_lock);
+
+	if (!sb->map[index].cleared)
+		goto out_unlock;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	ret = true;
+out_unlock:
+	spin_unlock(&sb->map[index].swap_lock);
+	return ret;
+}
+
+static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
+				     unsigned int alloc_hint, bool round_robin)
+{
+	int nr;
+
+	do {
+		nr = __sbitmap_get_word(&sb->map[index].word,
+					sb->map[index].depth, alloc_hint,
+					!round_robin);
+		if (nr != -1)
+			break;
+		if (!sbitmap_deferred_clear(sb, index))
+			break;
+	} while (1);
+
+	return nr;
+}
+
 int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 {
 	unsigned int i, index;
@@ -129,9 +181,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin)
 		alloc_hint = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		nr = __sbitmap_get_word(&sb->map[index].word,
-					sb->map[index].depth, alloc_hint,
-					!round_robin);
+		nr = sbitmap_find_bit_in_index(sb, index, alloc_hint,
+						round_robin);
 		if (nr != -1) {
 			nr += index << sb->shift;
 			break;
@@ -206,23 +257,36 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
 }
 EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear);
 
-unsigned int sbitmap_weight(const struct sbitmap *sb)
+static unsigned int __sbitmap_weight(const struct sbitmap *sb, bool set)
 {
 	unsigned int i, weight = 0;
 
 	for (i = 0; i < sb->map_nr; i++) {
 		const struct sbitmap_word *word = &sb->map[i];
 
-		weight += bitmap_weight(&word->word, word->depth);
+		if (set)
+			weight += bitmap_weight(&word->word, word->depth);
+		else
+			weight += bitmap_weight(&word->cleared, word->depth);
 	}
 	return weight;
 }
-EXPORT_SYMBOL_GPL(sbitmap_weight);
+
+static unsigned int sbitmap_weight(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, true);
+}
+
+static unsigned int sbitmap_cleared(const struct sbitmap *sb)
+{
+	return __sbitmap_weight(sb, false);
+}
 
 void sbitmap_show(struct sbitmap *sb, struct seq_file *m)
 {
 	seq_printf(m, "depth=%u\n", sb->depth);
-	seq_printf(m, "busy=%u\n", sbitmap_weight(sb));
+	seq_printf(m, "busy=%u\n", sbitmap_weight(sb) - sbitmap_cleared(sb));
+	seq_printf(m, "cleared=%u\n", sbitmap_cleared(sb));
 	seq_printf(m, "bits_per_word=%u\n", 1U << sb->shift);
 	seq_printf(m, "map_nr=%u\n", sb->map_nr);
 }
@@ -514,7 +578,8 @@ EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
 void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
 			 unsigned int cpu)
 {
-	sbitmap_clear_bit_unlock(&sbq->sb, nr);
+	sbitmap_deferred_clear_bit(&sbq->sb, nr);
+
 	/*
 	 * Pairs with the memory barrier in set_current_state() to ensure the
 	 * proper ordering of clear_bit_unlock()/waitqueue_active() in the waker

From 5d2ee7122c73be6a3b6bfe90d237e8aed737cfaa Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 29 Nov 2018 17:36:41 -0700
Subject: [PATCH 178/351] sbitmap: optimize wakeup check

Even if we have no waiters on any of the sbitmap_queue wait states, we
still have to loop every entry to check. We do this for every IO, so
the cost adds up.

Shift a bit of the cost to the slow path, when we actually have waiters.
Wrap prepare_to_wait_exclusive() and finish_wait(), so we can maintain
an internal count of how many are currently active. Then we can simply
check this count in sbq_wake_ptr() and not have to loop if we don't
have any sleepers.

Convert the two users of sbitmap with waiting, blk-mq-tag and iSCSI.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-tag.c                       | 11 ++++----
 drivers/target/iscsi/iscsi_target_util.c | 12 +++++----
 include/linux/sbitmap.h                  | 34 ++++++++++++++++++++++++
 lib/sbitmap.c                            | 28 +++++++++++++++++++
 4 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 87bc5df72d48..2089c6c62f44 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	struct sbitmap_queue *bt;
 	struct sbq_wait_state *ws;
-	DEFINE_WAIT(wait);
+	DEFINE_SBQ_WAIT(wait);
 	unsigned int tag_offset;
 	bool drop_ctx;
 	int tag;
@@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
-		prepare_to_wait_exclusive(&ws->wait, &wait,
-						TASK_UNINTERRUPTIBLE);
+		sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
 
 		tag = __blk_mq_get_tag(data, bt);
 		if (tag != -1)
@@ -167,6 +166,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		bt_prev = bt;
 		io_schedule();
 
+		sbitmap_finish_wait(bt, ws, &wait);
+
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
 						data->ctx->cpu);
@@ -176,8 +177,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		else
 			bt = &tags->bitmap_tags;
 
-		finish_wait(&ws->wait, &wait);
-
 		/*
 		 * If destination hw queue is changed, fake wake up on
 		 * previous queue for compensating the wake up miss, so
@@ -192,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	if (drop_ctx && data->ctx)
 		blk_mq_put_ctx(data->ctx);
 
-	finish_wait(&ws->wait, &wait);
+	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
 	return tag + tag_offset;
diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c
index 36b742932c72..86987da86dd6 100644
--- a/drivers/target/iscsi/iscsi_target_util.c
+++ b/drivers/target/iscsi/iscsi_target_util.c
@@ -150,24 +150,26 @@ void iscsit_free_r2ts_from_list(struct iscsi_cmd *cmd)
 static int iscsit_wait_for_tag(struct se_session *se_sess, int state, int *cpup)
 {
 	int tag = -1;
-	DEFINE_WAIT(wait);
+	DEFINE_SBQ_WAIT(wait);
 	struct sbq_wait_state *ws;
+	struct sbitmap_queue *sbq;
 
 	if (state == TASK_RUNNING)
 		return tag;
 
-	ws = &se_sess->sess_tag_pool.ws[0];
+	sbq = &se_sess->sess_tag_pool;
+	ws = &sbq->ws[0];
 	for (;;) {
-		prepare_to_wait_exclusive(&ws->wait, &wait, state);
+		sbitmap_prepare_to_wait(sbq, ws, &wait, state);
 		if (signal_pending_state(state, current))
 			break;
-		tag = sbitmap_queue_get(&se_sess->sess_tag_pool, cpup);
+		tag = sbitmap_queue_get(sbq, cpup);
 		if (tag >= 0)
 			break;
 		schedule();
 	}
 
-	finish_wait(&ws->wait, &wait);
+	sbitmap_finish_wait(sbq, ws, &wait);
 	return tag;
 }
 
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 81359d45751e..92806a2dbab7 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -135,6 +135,11 @@ struct sbitmap_queue {
 	 */
 	struct sbq_wait_state *ws;
 
+	/*
+	 * @ws_active: count of currently active ws waitqueues
+	 */
+	atomic_t ws_active;
+
 	/**
 	 * @round_robin: Allocate bits in strict round-robin order.
 	 */
@@ -552,4 +557,33 @@ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq);
  */
 void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);
 
+struct sbq_wait {
+	int accounted;
+	struct wait_queue_entry wait;
+};
+
+#define DEFINE_SBQ_WAIT(name)							\
+	struct sbq_wait name = {						\
+		.accounted = 0,							\
+		.wait = {							\
+			.private	= current,				\
+			.func		= autoremove_wake_function,		\
+			.entry		= LIST_HEAD_INIT((name).wait.entry),	\
+		}								\
+	}
+
+/*
+ * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
+ * internal state.
+ */
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+				struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait, int state);
+
+/*
+ * Must be paired with sbitmap_prepare_to_wait().
+ */
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+				struct sbq_wait *sbq_wait);
+
 #endif /* __LINUX_SCALE_BITMAP_H */
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index f99382e59314..a89fbe7cf6ca 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -394,6 +394,7 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
 	sbq->min_shallow_depth = UINT_MAX;
 	sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
 	atomic_set(&sbq->wake_index, 0);
+	atomic_set(&sbq->ws_active, 0);
 
 	sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
 	if (!sbq->ws) {
@@ -509,6 +510,9 @@ static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
 {
 	int i, wake_index;
 
+	if (!atomic_read(&sbq->ws_active))
+		return NULL;
+
 	wake_index = atomic_read(&sbq->wake_index);
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
 		struct sbq_wait_state *ws = &sbq->ws[wake_index];
@@ -634,6 +638,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 
 	seq_printf(m, "wake_batch=%u\n", sbq->wake_batch);
 	seq_printf(m, "wake_index=%d\n", atomic_read(&sbq->wake_index));
+	seq_printf(m, "ws_active=%d\n", atomic_read(&sbq->ws_active));
 
 	seq_puts(m, "ws={\n");
 	for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
@@ -649,3 +654,26 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
 	seq_printf(m, "min_shallow_depth=%u\n", sbq->min_shallow_depth);
 }
 EXPORT_SYMBOL_GPL(sbitmap_queue_show);
+
+void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
+			     struct sbq_wait_state *ws,
+			     struct sbq_wait *sbq_wait, int state)
+{
+	if (!sbq_wait->accounted) {
+		atomic_inc(&sbq->ws_active);
+		sbq_wait->accounted = 1;
+	}
+	prepare_to_wait_exclusive(&ws->wait, &sbq_wait->wait, state);
+}
+EXPORT_SYMBOL_GPL(sbitmap_prepare_to_wait);
+
+void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
+			 struct sbq_wait *sbq_wait)
+{
+	finish_wait(&ws->wait, &sbq_wait->wait);
+	if (sbq_wait->accounted) {
+		atomic_dec(&sbq->ws_active);
+		sbq_wait->accounted = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(sbitmap_finish_wait);

From 2149da0748fc236b9916c53e26b3b0c9ab20a5dd Mon Sep 17 00:00:00 2001
From: Balbir Singh <sblbir@amzn.com>
Date: Tue, 30 Oct 2018 02:40:15 +0000
Subject: [PATCH 179/351] block: add cmd_flags to print_req_error

I ran into a bug where after hibernation due to incompatible
backends, the block driver returned BLK_STS_NOTSUPP, with the
current message it's hard to find out what the command flags
were. Adding req->cmd_flags help make the problem easier to
diagnose.

Reviewed-by: Eduardo Valentin <eduval@amazon.com>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Balbir Singh <sblbir@amzn.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3f6f5e6c2fe4..a1a5e1c14898 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -173,10 +173,11 @@ static void print_req_error(struct request *req, blk_status_t status)
 	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
 		return;
 
-	printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n",
-			   __func__, blk_errors[idx].name, req->rq_disk ?
-			   req->rq_disk->disk_name : "?",
-			   (unsigned long long)blk_rq_pos(req));
+	printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n",
+				__func__, blk_errors[idx].name,
+				req->rq_disk ?  req->rq_disk->disk_name : "?",
+				(unsigned long long)blk_rq_pos(req),
+				req->cmd_flags);
 }
 
 static void req_bio_endio(struct request *rq, struct bio *bio,

From fe1f452640d888c4311ef35c8ee497f47c3ea17c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 28 Nov 2018 10:50:07 -0700
Subject: [PATCH 180/351] blk-mq: don't call ktime_get_ns() if we don't need it

We only need the request fields and the end_io time if we have
stats enabled, or if we have a scheduler attached as those may
use it for completion time stats.

Reviewed-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7dcef565dc0f..e09d7f500077 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -281,6 +281,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
+/*
+ * Only need start/end time stamping if we have stats enabled, or using
+ * an IO scheduler.
+ */
+static inline bool blk_mq_need_time_stamp(struct request *rq)
+{
+	return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
+}
+
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 		unsigned int tag, unsigned int op)
 {
@@ -316,7 +325,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->rq_disk = NULL;
 	rq->part = NULL;
-	rq->start_time_ns = ktime_get_ns();
+	if (blk_mq_need_time_stamp(rq))
+		rq->start_time_ns = ktime_get_ns();
+	else
+		rq->start_time_ns = 0;
 	rq->io_start_time_ns = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -522,7 +534,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
-	u64 now = ktime_get_ns();
+	u64 now = 0;
+
+	if (blk_mq_need_time_stamp(rq))
+		now = ktime_get_ns();
 
 	if (rq->rq_flags & RQF_STATS) {
 		blk_mq_poll_stats_start(rq->q);

From 8c2def893afc60d88160d524acf345765cf0c447 Mon Sep 17 00:00:00 2001
From: Omar Sandoval <osandov@fb.com>
Date: Mon, 3 Dec 2018 14:45:43 -0800
Subject: [PATCH 181/351] sbitmap: fix sbitmap_for_each_set()

We need to ignore bits in the cleared mask when iterating over all set
bits.

Fixes: ea86ea2cdced ("sbitmap: ammortize cost of clearing bits")
Reported-by: Jens Axboe@kernel.dk>
Signed-off-by: Omar Sandoval <osandov@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sbitmap.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 92806a2dbab7..03f50fcedc79 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -265,12 +265,14 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
 	nr = SB_NR_TO_BIT(sb, start);
 
 	while (scanned < sb->depth) {
-		struct sbitmap_word *word = &sb->map[index];
-		unsigned int depth = min_t(unsigned int, word->depth - nr,
+		unsigned long word;
+		unsigned int depth = min_t(unsigned int,
+					   sb->map[index].depth - nr,
 					   sb->depth - scanned);
 
 		scanned += depth;
-		if (!word->word)
+		word = sb->map[index].word & ~sb->map[index].cleared;
+		if (!word)
 			goto next;
 
 		/*
@@ -280,7 +282,7 @@ static inline void __sbitmap_for_each_set(struct sbitmap *sb,
 		 */
 		depth += nr;
 		while (1) {
-			nr = find_next_bit(&word->word, depth, nr);
+			nr = find_next_bit(&word, depth, nr);
 			if (nr >= depth)
 				break;
 			if (!fn(sb, (index << sb->shift) + nr, data))

From 154989e45fd8de9bfb52bbd6e5ea763e437e54c5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 22 Nov 2018 16:44:07 +0100
Subject: [PATCH 182/351] aio: clear IOCB_HIPRI

No one is going to poll for aio (yet), so we must clear the HIPRI
flag, as we would otherwise send it down the poll queues, where no
one will be polling for completions.

Signed-off-by: Christoph Hellwig <hch@lst.de>

IOCB_HIPRI, not RWF_HIPRI.

Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/aio.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 205390c0c1bb..05647d352bf3 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1436,8 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
 		ret = ioprio_check_cap(iocb->aio_reqprio);
 		if (ret) {
 			pr_debug("aio ioprio check cap error: %d\n", ret);
-			fput(req->ki_filp);
-			return ret;
+			goto out_fput;
 		}
 
 		req->ki_ioprio = iocb->aio_reqprio;
@@ -1446,7 +1445,13 @@ static int aio_prep_rw(struct kiocb *req, struct iocb *iocb)
 
 	ret = kiocb_set_rw_flags(req, iocb->aio_rw_flags);
 	if (unlikely(ret))
-		fput(req->ki_filp);
+		goto out_fput;
+
+	req->ki_flags &= ~IOCB_HIPRI; /* no one is going to poll for this I/O */
+	return 0;
+
+out_fput:
+	fput(req->ki_filp);
 	return ret;
 }
 

From e20ba6e1da029136ded295f33076483d65ddf50a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:16 +0100
Subject: [PATCH 183/351] block: move queues types to the block layer

Having another indirect all in the fast path doesn't really help
in our post-spectre world.  Also having too many queue type is just
going to create confusion, so I'd rather manage them centrally.

Note that the queue type naming and ordering changes a bit - the
first index now is the default queue for everything not explicitly
marked, the optional ones are read and poll queues.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sysfs.c    |  9 +++++-
 block/blk-mq.h          | 21 +++++++------
 drivers/nvme/host/pci.c | 68 +++++++++++++++--------------------------
 include/linux/blk-mq.h  | 15 ++++-----
 4 files changed, 51 insertions(+), 62 deletions(-)

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 6efef1f679f0..9c2df137256a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -173,9 +173,16 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return ret;
 }
 
+static const char *const hctx_types[] = {
+	[HCTX_TYPE_DEFAULT]	= "default",
+	[HCTX_TYPE_READ]	= "read",
+	[HCTX_TYPE_POLL]	= "poll",
+};
+
 static ssize_t blk_mq_hw_sysfs_type_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
-	return sprintf(page, "%u\n", hctx->type);
+	BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES);
+	return sprintf(page, "%s\n", hctx_types[hctx->type]);
 }
 
 static struct attribute *default_ctx_attrs[] = {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7291e5379358..a664ea44ffd4 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -81,16 +81,14 @@ extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int);
 /*
  * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue
  * @q: request queue
- * @hctx_type: the hctx type index
+ * @type: the hctx type index
  * @cpu: CPU
  */
 static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q,
-							  unsigned int hctx_type,
+							  enum hctx_type type,
 							  unsigned int cpu)
 {
-	struct blk_mq_tag_set *set = q->tag_set;
-
-	return q->queue_hw_ctx[set->map[hctx_type].mq_map[cpu]];
+	return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
 }
 
 /*
@@ -103,12 +101,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 						     unsigned int flags,
 						     unsigned int cpu)
 {
-	int hctx_type = 0;
+	enum hctx_type type = HCTX_TYPE_DEFAULT;
 
-	if (q->mq_ops->rq_flags_to_type)
-		hctx_type = q->mq_ops->rq_flags_to_type(q, flags);
+	if (q->tag_set->nr_maps > HCTX_TYPE_POLL &&
+	    ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags)))
+		type = HCTX_TYPE_POLL;
 
-	return blk_mq_map_queue_type(q, hctx_type, cpu);
+	else if (q->tag_set->nr_maps > HCTX_TYPE_READ &&
+		 ((flags & REQ_OP_MASK) == REQ_OP_READ))
+		type = HCTX_TYPE_READ;
+
+	return blk_mq_map_queue_type(q, type, cpu);
 }
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 527907aa6903..a1bb4bb92e7f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -95,13 +95,6 @@ struct nvme_queue;
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 
-enum {
-	NVMEQ_TYPE_READ,
-	NVMEQ_TYPE_WRITE,
-	NVMEQ_TYPE_POLL,
-	NVMEQ_TYPE_NR,
-};
-
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
@@ -115,7 +108,7 @@ struct nvme_dev {
 	struct dma_pool *prp_small_pool;
 	unsigned online_queues;
 	unsigned max_qid;
-	unsigned io_queues[NVMEQ_TYPE_NR];
+	unsigned io_queues[HCTX_MAX_TYPES];
 	unsigned int num_vecs;
 	int q_depth;
 	u32 db_stride;
@@ -499,10 +492,10 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 
 		map->nr_queues = dev->io_queues[i];
 		if (!map->nr_queues) {
-			BUG_ON(i == NVMEQ_TYPE_READ);
+			BUG_ON(i == HCTX_TYPE_DEFAULT);
 
 			/* shared set, resuse read set parameters */
-			map->nr_queues = dev->io_queues[NVMEQ_TYPE_READ];
+			map->nr_queues = dev->io_queues[HCTX_TYPE_DEFAULT];
 			qoff = 0;
 			offset = queue_irq_offset(dev);
 		}
@@ -512,7 +505,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 		 * affinity), so use the regular blk-mq cpu mapping
 		 */
 		map->queue_offset = qoff;
-		if (i != NVMEQ_TYPE_POLL)
+		if (i != HCTX_TYPE_POLL)
 			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
 		else
 			blk_mq_map_queues(map);
@@ -961,16 +954,6 @@ out_free_cmd:
 	return ret;
 }
 
-static int nvme_rq_flags_to_type(struct request_queue *q, unsigned int flags)
-{
-	if ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
-		return NVMEQ_TYPE_POLL;
-	if ((flags & REQ_OP_MASK) == REQ_OP_READ)
-		return NVMEQ_TYPE_READ;
-
-	return NVMEQ_TYPE_WRITE;
-}
-
 static void nvme_pci_complete_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1634,7 +1617,6 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 #define NVME_SHARED_MQ_OPS					\
 	.queue_rq		= nvme_queue_rq,		\
 	.commit_rqs		= nvme_commit_rqs,		\
-	.rq_flags_to_type	= nvme_rq_flags_to_type,	\
 	.complete		= nvme_pci_complete_rq,		\
 	.init_hctx		= nvme_init_hctx,		\
 	.init_request		= nvme_init_request,		\
@@ -1785,9 +1767,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	}
 
 	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
-	if (max != 1 && dev->io_queues[NVMEQ_TYPE_POLL]) {
-		rw_queues = dev->io_queues[NVMEQ_TYPE_READ] +
-				dev->io_queues[NVMEQ_TYPE_WRITE];
+	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
+		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
+				dev->io_queues[HCTX_TYPE_READ];
 	} else {
 		rw_queues = max;
 	}
@@ -2076,9 +2058,9 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
 	 * Setup read/write queue split
 	 */
 	if (nr_io_queues == 1) {
-		dev->io_queues[NVMEQ_TYPE_READ] = 1;
-		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
-		dev->io_queues[NVMEQ_TYPE_POLL] = 0;
+		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+		dev->io_queues[HCTX_TYPE_READ] = 0;
+		dev->io_queues[HCTX_TYPE_POLL] = 0;
 		return;
 	}
 
@@ -2095,10 +2077,10 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
 			this_p_queues = nr_io_queues - 1;
 		}
 
-		dev->io_queues[NVMEQ_TYPE_POLL] = this_p_queues;
+		dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 		nr_io_queues -= this_p_queues;
 	} else
-		dev->io_queues[NVMEQ_TYPE_POLL] = 0;
+		dev->io_queues[HCTX_TYPE_POLL] = 0;
 
 	/*
 	 * If 'write_queues' is set, ensure it leaves room for at least
@@ -2112,11 +2094,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
 	 * a queue set.
 	 */
 	if (!this_w_queues) {
-		dev->io_queues[NVMEQ_TYPE_WRITE] = 0;
-		dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues;
+		dev->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
+		dev->io_queues[HCTX_TYPE_READ] = 0;
 	} else {
-		dev->io_queues[NVMEQ_TYPE_WRITE] = this_w_queues;
-		dev->io_queues[NVMEQ_TYPE_READ] = nr_io_queues - this_w_queues;
+		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
+		dev->io_queues[HCTX_TYPE_READ] = nr_io_queues - this_w_queues;
 	}
 }
 
@@ -2138,8 +2120,8 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 	 */
 	do {
 		nvme_calc_io_queues(dev, nr_io_queues);
-		irq_sets[0] = dev->io_queues[NVMEQ_TYPE_READ];
-		irq_sets[1] = dev->io_queues[NVMEQ_TYPE_WRITE];
+		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
+		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
 		if (!irq_sets[1])
 			affd.nr_sets = 1;
 
@@ -2226,12 +2208,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	dev->num_vecs = result;
 	result = max(result - 1, 1);
-	dev->max_qid = result + dev->io_queues[NVMEQ_TYPE_POLL];
+	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
 
-	dev_info(dev->ctrl.device, "%d/%d/%d read/write/poll queues\n",
-					dev->io_queues[NVMEQ_TYPE_READ],
-					dev->io_queues[NVMEQ_TYPE_WRITE],
-					dev->io_queues[NVMEQ_TYPE_POLL]);
+	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
+					dev->io_queues[HCTX_TYPE_DEFAULT],
+					dev->io_queues[HCTX_TYPE_READ],
+					dev->io_queues[HCTX_TYPE_POLL]);
 
 	/*
 	 * Should investigate if there's a performance win from allocating
@@ -2332,13 +2314,13 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	int ret;
 
 	if (!dev->ctrl.tagset) {
-		if (!dev->io_queues[NVMEQ_TYPE_POLL])
+		if (!dev->io_queues[HCTX_TYPE_POLL])
 			dev->tagset.ops = &nvme_mq_ops;
 		else
 			dev->tagset.ops = &nvme_mq_poll_noirq_ops;
 
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
-		dev->tagset.nr_maps = NVMEQ_TYPE_NR;
+		dev->tagset.nr_maps = HCTX_MAX_TYPES;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;
 		dev->tagset.numa_node = dev_to_node(dev->dev);
 		dev->tagset.queue_depth =
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 467f1dd21ccf..57eda7b20243 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -81,8 +81,12 @@ struct blk_mq_queue_map {
 	unsigned int queue_offset;
 };
 
-enum {
-	HCTX_MAX_TYPES = 3,
+enum hctx_type {
+	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
+	HCTX_TYPE_READ,		/* just for READ I/O */
+	HCTX_TYPE_POLL,		/* polled I/O of any kind */
+
+	HCTX_MAX_TYPES,
 };
 
 struct blk_mq_tag_set {
@@ -118,8 +122,6 @@ struct blk_mq_queue_data {
 typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *,
 		const struct blk_mq_queue_data *);
 typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *);
-/* takes rq->cmd_flags as input, returns a hardware type index */
-typedef int (rq_flags_to_type_fn)(struct request_queue *, unsigned int);
 typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *);
 typedef void (put_budget_fn)(struct blk_mq_hw_ctx *);
 typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
@@ -154,11 +156,6 @@ struct blk_mq_ops {
 	 */
 	commit_rqs_fn		*commit_rqs;
 
-	/*
-	 * Return a queue map type for the given request/bio flags
-	 */
-	rq_flags_to_type_fn	*rq_flags_to_type;
-
 	/*
 	 * Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the

From 4e224106673f1e8679249a9cac75712b896861b0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:17 +0100
Subject: [PATCH 184/351] nvme-pci: use atomic bitops to mark a queue enabled

This gets rid of all the messing with cq_vector and the ->polled field
by using an atomic bitop to mark the queue enabled or not.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 43 ++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a1bb4bb92e7f..022395a319f4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -201,7 +201,8 @@ struct nvme_queue {
 	u16 last_cq_head;
 	u16 qid;
 	u8 cq_phase;
-	u8 polled;
+	unsigned long flags;
+#define NVMEQ_ENABLED		0
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -927,7 +928,7 @@ static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * We should not need to do this, but we're still using this to
 	 * ensure we can drain requests on a dying queue.
 	 */
-	if (unlikely(nvmeq->cq_vector < 0 && !nvmeq->polled))
+	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
 		return BLK_STS_IOERR;
 
 	ret = nvme_setup_cmd(ns, req, &cmnd);
@@ -1395,31 +1396,19 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
  */
 static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 {
-	int vector;
-
-	spin_lock_irq(&nvmeq->cq_lock);
-	if (nvmeq->cq_vector == -1 && !nvmeq->polled) {
-		spin_unlock_irq(&nvmeq->cq_lock);
+	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
 		return 1;
-	}
-	vector = nvmeq->cq_vector;
-	nvmeq->dev->online_queues--;
-	nvmeq->cq_vector = -1;
-	nvmeq->polled = false;
-	spin_unlock_irq(&nvmeq->cq_lock);
 
-	/*
-	 * Ensure that nvme_queue_rq() sees it ->cq_vector == -1 without
-	 * having to grab the lock.
-	 */
+	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
 	mb();
 
+	nvmeq->dev->online_queues--;
 	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
 		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
-
-	if (vector != -1)
-		pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq);
-
+	if (nvmeq->cq_vector == -1)
+		return 0;
+	pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
+	nvmeq->cq_vector = -1;
 	return 0;
 }
 
@@ -1578,13 +1567,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	else if (result)
 		goto release_cq;
 
-	/*
-	 * Set cq_vector after alloc cq/sq, otherwise nvme_suspend_queue will
-	 * invoke free_irq for it and cause a 'Trying to free already-free IRQ
-	 * xxx' warning if the create CQ/SQ command times out.
-	 */
 	nvmeq->cq_vector = vector;
-	nvmeq->polled = polled;
 	nvme_init_queue(nvmeq, qid);
 
 	if (vector != -1) {
@@ -1593,11 +1576,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 			goto release_sq;
 	}
 
+	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
 	return result;
 
 release_sq:
 	nvmeq->cq_vector = -1;
-	nvmeq->polled = false;
 	dev->online_queues--;
 	adapter_delete_sq(dev, qid);
 release_cq:
@@ -1751,6 +1734,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 		return result;
 	}
 
+	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
 	return result;
 }
 
@@ -2173,6 +2157,8 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 
 	if (nr_io_queues == 0)
 		return 0;
+	
+	clear_bit(NVMEQ_ENABLED, &adminq->flags);
 
 	if (dev->cmb_use_sqes) {
 		result = nvme_cmb_qdepth(dev, nr_io_queues,
@@ -2227,6 +2213,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 		adminq->cq_vector = -1;
 		return result;
 	}
+	set_bit(NVMEQ_ENABLED, &adminq->flags);
 	return nvme_create_io_queues(dev);
 }
 

From 6322307809649cba6f545640563f95d686ecf404 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:18 +0100
Subject: [PATCH 185/351] nvme-pci: cleanup SQ allocation a bit

Use a bit flag to mark if the SQ was allocated from the CMB, and clean
up the surrounding code a bit.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 022395a319f4..b820dd0351cb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -186,7 +186,6 @@ struct nvme_queue {
 	struct nvme_dev *dev;
 	spinlock_t sq_lock;
 	struct nvme_command *sq_cmds;
-	bool sq_cmds_is_io;
 	spinlock_t cq_lock ____cacheline_aligned_in_smp;
 	volatile struct nvme_completion *cqes;
 	struct blk_mq_tags **tags;
@@ -203,6 +202,7 @@ struct nvme_queue {
 	u8 cq_phase;
 	unsigned long flags;
 #define NVMEQ_ENABLED		0
+#define NVMEQ_SQ_CMB		1
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
@@ -1366,17 +1366,15 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
 	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
 				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+	if (!nvmeq->sq_cmds)
+		return;
 
-	if (nvmeq->sq_cmds) {
-		if (nvmeq->sq_cmds_is_io)
-			pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
-					nvmeq->sq_cmds,
-					SQ_SIZE(nvmeq->q_depth));
-		else
-			dma_free_coherent(nvmeq->q_dmadev,
-					  SQ_SIZE(nvmeq->q_depth),
-					  nvmeq->sq_cmds,
-					  nvmeq->sq_dma_addr);
+	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
+		pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
+				nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
+	} else {
+		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
 	}
 }
 
@@ -1462,15 +1460,14 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
 		nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
 						nvmeq->sq_cmds);
-		nvmeq->sq_cmds_is_io = true;
-	}
-
-	if (!nvmeq->sq_cmds) {
-		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-					&nvmeq->sq_dma_addr, GFP_KERNEL);
-		nvmeq->sq_cmds_is_io = false;
+		if (nvmeq->sq_dma_addr) {
+			set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
+			return 0; 
+		}
 	}
 
+	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+				&nvmeq->sq_dma_addr, GFP_KERNEL);
 	if (!nvmeq->sq_cmds)
 		return -ENOMEM;
 	return 0;

From c6d962aebaf8ec5d867aac09ee33e3f528c2539d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:19 +0100
Subject: [PATCH 186/351] nvme-pci: only allow polling with separate poll
 queues

This will allow us to simplify both the regular NVMe interrupt handler
and the upcoming aio poll code.  In addition to that the separate
queues are generally a good idea for performance reasons.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b820dd0351cb..d42bb76e5e78 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1089,13 +1089,6 @@ static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
 }
 
 static int nvme_poll(struct blk_mq_hw_ctx *hctx)
-{
-	struct nvme_queue *nvmeq = hctx->driver_data;
-
-	return __nvme_poll(nvmeq, -1);
-}
-
-static int nvme_poll_noirq(struct blk_mq_hw_ctx *hctx)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
 	u16 start, end;
@@ -1605,12 +1598,11 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 
 static const struct blk_mq_ops nvme_mq_ops = {
 	NVME_SHARED_MQ_OPS,
-	.poll			= nvme_poll,
 };
 
-static const struct blk_mq_ops nvme_mq_poll_noirq_ops = {
+static const struct blk_mq_ops nvme_mq_poll_ops = {
 	NVME_SHARED_MQ_OPS,
-	.poll			= nvme_poll_noirq,
+	.poll			= nvme_poll,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -2298,10 +2290,10 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	int ret;
 
 	if (!dev->ctrl.tagset) {
-		if (!dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.ops = &nvme_mq_ops;
+		if (dev->io_queues[HCTX_TYPE_POLL])
+			dev->tagset.ops = &nvme_mq_poll_ops;
 		else
-			dev->tagset.ops = &nvme_mq_poll_noirq_ops;
+			dev->tagset.ops = &nvme_mq_ops;
 
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
 		dev->tagset.nr_maps = HCTX_MAX_TYPES;

From 0b2a8a9f4b564c7d923597828d93cd1f69ce40e0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:20 +0100
Subject: [PATCH 187/351] nvme-pci: consolidate code for polling non-dedicated
 queues

We have three places that can poll for I/O completions on a normal
interrupt-enabled queue.  All of them are in slow path code, so
consolidate them to a single helper that uses spin_lock_irqsave and
removes the fast path cqe_pending check.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 35 ++++++++++++-----------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d42bb76e5e78..10c26a2e355a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1072,17 +1072,19 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_NONE;
 }
 
-static int __nvme_poll(struct nvme_queue *nvmeq, unsigned int tag)
+/*
+ * Poll for completions any queue, including those not dedicated to polling.
+ * Can be called from any context.
+ */
+static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
 {
+	unsigned long flags;
 	u16 start, end;
 	int found;
 
-	if (!nvme_cqe_pending(nvmeq))
-		return 0;
-
-	spin_lock_irq(&nvmeq->cq_lock);
+	spin_lock_irqsave(&nvmeq->cq_lock, flags);
 	found = nvme_process_cq(nvmeq, &start, &end, tag);
-	spin_unlock_irq(&nvmeq->cq_lock);
+	spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
 
 	nvme_complete_cqes(nvmeq, start, end);
 	return found;
@@ -1279,7 +1281,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	/*
 	 * Did we miss an interrupt?
 	 */
-	if (__nvme_poll(nvmeq, req->tag)) {
+	if (nvme_poll_irqdisable(nvmeq, req->tag)) {
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, completion polled\n",
 			 req->tag, nvmeq->qid);
@@ -1406,18 +1408,13 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
 {
 	struct nvme_queue *nvmeq = &dev->queues[0];
-	u16 start, end;
 
 	if (shutdown)
 		nvme_shutdown_ctrl(&dev->ctrl);
 	else
 		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
 
-	spin_lock_irq(&nvmeq->cq_lock);
-	nvme_process_cq(nvmeq, &start, &end, -1);
-	spin_unlock_irq(&nvmeq->cq_lock);
-
-	nvme_complete_cqes(nvmeq, start, end);
+	nvme_poll_irqdisable(nvmeq, -1);
 }
 
 static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
@@ -2217,17 +2214,9 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
-	u16 start, end;
 
-	if (!error) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&nvmeq->cq_lock, flags);
-		nvme_process_cq(nvmeq, &start, &end, -1);
-		spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
-
-		nvme_complete_cqes(nvmeq, start, end);
-	}
+	if (!error)
+		nvme_poll_irqdisable(nvmeq, -1);
 
 	nvme_del_queue_end(req, error);
 }

From 5271edd41dd895773d3105f5065bbf5ded0dee7c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:21 +0100
Subject: [PATCH 188/351] nvme-pci: refactor nvme_disable_io_queues

Pass the opcode for the delete SQ/CQ command as an argument instead of
the somewhat confusing pass loop.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10c26a2e355a..9ceba9900ca3 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2244,31 +2244,29 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 	return 0;
 }
 
-static void nvme_disable_io_queues(struct nvme_dev *dev)
+static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 {
-	int pass, queues = dev->online_queues - 1;
+	int nr_queues = dev->online_queues - 1, sent = 0;
 	unsigned long timeout;
-	u8 opcode = nvme_admin_delete_sq;
 
-	for (pass = 0; pass < 2; pass++) {
-		int sent = 0, i = queues;
-
-		reinit_completion(&dev->ioq_wait);
+	reinit_completion(&dev->ioq_wait);
  retry:
-		timeout = ADMIN_TIMEOUT;
-		for (; i > 0; i--, sent++)
-			if (nvme_delete_queue(&dev->queues[i], opcode))
-				break;
-
-		while (sent--) {
-			timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout);
-			if (timeout == 0)
-				return;
-			if (i)
-				goto retry;
-		}
-		opcode = nvme_admin_delete_cq;
+	timeout = ADMIN_TIMEOUT;
+	while (nr_queues > 0) {
+		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
+			break;
+		nr_queues--;
+		sent++;
 	}
+	while (sent--) {
+		timeout = wait_for_completion_io_timeout(&dev->ioq_wait,
+				timeout);
+		if (timeout == 0)
+			return false;
+		if (nr_queues)
+			goto retry;
+	}
+	return true;
 }
 
 /*
@@ -2428,7 +2426,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	nvme_stop_queues(&dev->ctrl);
 
 	if (!dead && dev->ctrl.queue_count > 0) {
-		nvme_disable_io_queues(dev);
+		if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
+			nvme_disable_io_queues(dev, nvme_admin_delete_cq);
 		nvme_disable_admin_queue(dev, shutdown);
 	}
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)

From d1ed6aa14bc418531220478604c7b12c5e98fdca Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:22 +0100
Subject: [PATCH 189/351] nvme-pci: don't poll from irq context when deleting
 queues

This is the last place outside of nvme_irq that handles CQEs from
interrupt context, and thus is in the way of removing the cq_lock for
normal queues, and avoiding lockdep warnings on the poll queues, for
which we already take it without IRQ disabling.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9ceba9900ca3..2d5a468c35b1 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -122,7 +122,6 @@ struct nvme_dev {
 	u32 cmbsz;
 	u32 cmbloc;
 	struct nvme_ctrl ctrl;
-	struct completion ioq_wait;
 
 	mempool_t *iod_mempool;
 
@@ -203,10 +202,12 @@ struct nvme_queue {
 	unsigned long flags;
 #define NVMEQ_ENABLED		0
 #define NVMEQ_SQ_CMB		1
+#define NVMEQ_DELETE_ERROR	2
 	u32 *dbbuf_sq_db;
 	u32 *dbbuf_cq_db;
 	u32 *dbbuf_sq_ei;
 	u32 *dbbuf_cq_ei;
+	struct completion delete_done;
 };
 
 /*
@@ -1535,6 +1536,8 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	int result;
 	s16 vector;
 
+	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
+
 	/*
 	 * A queue's vector matches the queue identifier unless the controller
 	 * has only one vector available.
@@ -2208,15 +2211,15 @@ static void nvme_del_queue_end(struct request *req, blk_status_t error)
 	struct nvme_queue *nvmeq = req->end_io_data;
 
 	blk_mq_free_request(req);
-	complete(&nvmeq->dev->ioq_wait);
+	complete(&nvmeq->delete_done);
 }
 
 static void nvme_del_cq_end(struct request *req, blk_status_t error)
 {
 	struct nvme_queue *nvmeq = req->end_io_data;
 
-	if (!error)
-		nvme_poll_irqdisable(nvmeq, -1);
+	if (error)
+		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
 
 	nvme_del_queue_end(req, error);
 }
@@ -2238,6 +2241,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
 	req->timeout = ADMIN_TIMEOUT;
 	req->end_io_data = nvmeq;
 
+	init_completion(&nvmeq->delete_done);
 	blk_execute_rq_nowait(q, NULL, req, false,
 			opcode == nvme_admin_delete_cq ?
 				nvme_del_cq_end : nvme_del_queue_end);
@@ -2249,7 +2253,6 @@ static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 	int nr_queues = dev->online_queues - 1, sent = 0;
 	unsigned long timeout;
 
-	reinit_completion(&dev->ioq_wait);
  retry:
 	timeout = ADMIN_TIMEOUT;
 	while (nr_queues > 0) {
@@ -2258,11 +2261,20 @@ static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
 		nr_queues--;
 		sent++;
 	}
-	while (sent--) {
-		timeout = wait_for_completion_io_timeout(&dev->ioq_wait,
+	while (sent) {
+		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];
+
+		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
 				timeout);
 		if (timeout == 0)
 			return false;
+
+		/* handle any remaining CQEs */
+		if (opcode == nvme_admin_delete_cq &&
+		    !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
+			nvme_poll_irqdisable(nvmeq, -1);
+
+		sent--;
 		if (nr_queues)
 			goto retry;
 	}
@@ -2746,7 +2758,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
 	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
 	mutex_init(&dev->shutdown_lock);
-	init_completion(&dev->ioq_wait);
 
 	result = nvme_setup_prp_pools(dev);
 	if (result)

From 3a7afd8ee42a68d4f24ab9c947a4ef82d4d52375 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:23 +0100
Subject: [PATCH 190/351] nvme-pci: remove the CQ lock for interrupt driven
 queues

Now that we can't poll regular, interrupt driven I/O queues there
is almost nothing that can race with an interrupt.  The only
possible other contexts polling a CQ are the error handler and
queue shutdown, and both are so far off in the slow path that
we can simply use the big hammer of disabling interrupts.

With that we can stop taking the cq_lock for normal queues.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2d5a468c35b1..4ccb4ea22ac6 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -185,7 +185,8 @@ struct nvme_queue {
 	struct nvme_dev *dev;
 	spinlock_t sq_lock;
 	struct nvme_command *sq_cmds;
-	spinlock_t cq_lock ____cacheline_aligned_in_smp;
+	 /* only used for poll queues: */
+	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
 	volatile struct nvme_completion *cqes;
 	struct blk_mq_tags **tags;
 	dma_addr_t sq_dma_addr;
@@ -1050,12 +1051,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
 	irqreturn_t ret = IRQ_NONE;
 	u16 start, end;
 
-	spin_lock(&nvmeq->cq_lock);
+	/*
+	 * The rmb/wmb pair ensures we see all updates from a previous run of
+	 * the irq handler, even if that was on another CPU.
+	 */
+	rmb();
 	if (nvmeq->cq_head != nvmeq->last_cq_head)
 		ret = IRQ_HANDLED;
 	nvme_process_cq(nvmeq, &start, &end, -1);
 	nvmeq->last_cq_head = nvmeq->cq_head;
-	spin_unlock(&nvmeq->cq_lock);
+	wmb();
 
 	if (start != end) {
 		nvme_complete_cqes(nvmeq, start, end);
@@ -1079,13 +1084,24 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
  */
 static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
 {
-	unsigned long flags;
+	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
 	u16 start, end;
 	int found;
 
-	spin_lock_irqsave(&nvmeq->cq_lock, flags);
+	/*
+	 * For a poll queue we need to protect against the polling thread
+	 * using the CQ lock.  For normal interrupt driven threads we have
+	 * to disable the interrupt to avoid racing with it.
+	 */
+	if (nvmeq->cq_vector == -1)
+		spin_lock(&nvmeq->cq_poll_lock);
+	else
+		disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 	found = nvme_process_cq(nvmeq, &start, &end, tag);
-	spin_unlock_irqrestore(&nvmeq->cq_lock, flags);
+	if (nvmeq->cq_vector == -1)
+		spin_unlock(&nvmeq->cq_poll_lock);
+	else
+		enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 
 	nvme_complete_cqes(nvmeq, start, end);
 	return found;
@@ -1100,9 +1116,9 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
 	if (!nvme_cqe_pending(nvmeq))
 		return 0;
 
-	spin_lock(&nvmeq->cq_lock);
+	spin_lock(&nvmeq->cq_poll_lock);
 	found = nvme_process_cq(nvmeq, &start, &end, -1);
-	spin_unlock(&nvmeq->cq_lock);
+	spin_unlock(&nvmeq->cq_poll_lock);
 
 	nvme_complete_cqes(nvmeq, start, end);
 	return found;
@@ -1482,7 +1498,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 	nvmeq->q_dmadev = dev->dev;
 	nvmeq->dev = dev;
 	spin_lock_init(&nvmeq->sq_lock);
-	spin_lock_init(&nvmeq->cq_lock);
+	spin_lock_init(&nvmeq->cq_poll_lock);
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
@@ -1518,7 +1534,6 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 {
 	struct nvme_dev *dev = nvmeq->dev;
 
-	spin_lock_irq(&nvmeq->cq_lock);
 	nvmeq->sq_tail = 0;
 	nvmeq->last_sq_tail = 0;
 	nvmeq->cq_head = 0;
@@ -1527,7 +1542,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
 	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
 	nvme_dbbuf_init(dev, nvmeq, qid);
 	dev->online_queues++;
-	spin_unlock_irq(&nvmeq->cq_lock);
+	wmb(); /* ensure the first interrupt sees the initialization */
 }
 
 static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)

From f9801a484ad6dcc33b10c61b143efc3352541802 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:24 +0100
Subject: [PATCH 191/351] nvme-rdma: remove I/O polling support

The code was always a bit of a hack that digs far too much into
RDMA core internals.  Lets kick it out and reimplement proper
dedicated poll queues as needed.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/rdma.c | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 3ca096c1a506..75c01d20a133 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1738,29 +1738,6 @@ err:
 	return BLK_STS_IOERR;
 }
 
-static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
-{
-	struct nvme_rdma_queue *queue = hctx->driver_data;
-	struct ib_cq *cq = queue->ib_cq;
-	struct ib_wc wc;
-	int found = 0;
-
-	while (ib_poll_cq(cq, 1, &wc) > 0) {
-		struct ib_cqe *cqe = wc.wr_cqe;
-
-		if (cqe) {
-			if (cqe->done == nvme_rdma_recv_done) {
-				nvme_rdma_recv_done(cq, &wc);
-				found++;
-			} else {
-				cqe->done(cq, &wc);
-			}
-		}
-	}
-
-	return found;
-}
-
 static void nvme_rdma_complete_rq(struct request *rq)
 {
 	struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
@@ -1782,7 +1759,6 @@ static const struct blk_mq_ops nvme_rdma_mq_ops = {
 	.init_request	= nvme_rdma_init_request,
 	.exit_request	= nvme_rdma_exit_request,
 	.init_hctx	= nvme_rdma_init_hctx,
-	.poll		= nvme_rdma_poll,
 	.timeout	= nvme_rdma_timeout,
 	.map_queues	= nvme_rdma_map_queues,
 };

From 9d6610b76fa374eae3deb93bcbace4a06c2e3b95 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:25 +0100
Subject: [PATCH 192/351] nvme-mpath: remove I/O polling support

The ->poll_fn has been stale for a while, as a lot of places check for mq
ops.  But there is no real point in it anyway, as we don't even use
the multipath code for subsystems without multiple ports, which is usually
what we do high performance I/O to.  If it really becomes an issue we
should rework the nvme code to also skip the multipath code for any
private namespace, even if that could mean some trouble when rescanning.

Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/multipath.c | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index ffebdd0ae34b..ec310b1b9267 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -220,21 +220,6 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 	return ret;
 }
 
-static int nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc, bool spin)
-{
-	struct nvme_ns_head *head = q->queuedata;
-	struct nvme_ns *ns;
-	int found = 0;
-	int srcu_idx;
-
-	srcu_idx = srcu_read_lock(&head->srcu);
-	ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
-	if (likely(ns && nvme_path_is_optimized(ns)))
-		found = ns->queue->poll_fn(q, qc, spin);
-	srcu_read_unlock(&head->srcu, srcu_idx);
-	return found;
-}
-
 static void nvme_requeue_work(struct work_struct *work)
 {
 	struct nvme_ns_head *head =
@@ -281,7 +266,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 		goto out;
 	q->queuedata = head;
 	blk_queue_make_request(q, nvme_ns_head_make_request);
-	q->poll_fn = nvme_ns_head_poll;
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
 	/* set to a default value for 512 until disk is validated */
 	blk_queue_logical_block_size(q, 512);

From 529262d56dbebe6a26df5d2fd24cc0e1bc8579e5 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:26 +0100
Subject: [PATCH 193/351] block: remove ->poll_fn

This was intended to support users like nvme multipath, but is just
getting in the way and adding another indirect call.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 23 -----------------------
 block/blk-mq.c         | 24 +++++++++++++++++++-----
 include/linux/blkdev.h |  2 --
 3 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a1a5e1c14898..ad59102ee30a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1250,29 +1250,6 @@ blk_qc_t submit_bio(struct bio *bio)
 }
 EXPORT_SYMBOL(submit_bio);
 
-/**
- * blk_poll - poll for IO completions
- * @q:  the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- *    Poll for completions on the passed in queue. Returns number of
- *    completed entries found. If @spin is true, then blk_poll will continue
- *    looping until at least one completion is found, unless the task is
- *    otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
-{
-	if (!q->poll_fn || !blk_qc_t_valid(cookie))
-		return 0;
-
-	if (current->plug)
-		blk_flush_plug_list(current->plug, false);
-	return q->poll_fn(q, cookie, spin);
-}
-EXPORT_SYMBOL_GPL(blk_poll);
-
 /**
  * blk_cloned_rq_check_limits - Helper function to check a cloned request
  *                              for new the queue limits
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e09d7f500077..50d529602e05 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -38,7 +38,6 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -2838,8 +2837,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	spin_lock_init(&q->requeue_lock);
 
 	blk_queue_make_request(q, blk_mq_make_request);
-	if (q->mq_ops->poll)
-		q->poll_fn = blk_mq_poll;
 
 	/*
 	 * Do this after blk_queue_make_request() overrides it...
@@ -3400,14 +3397,30 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
 	return blk_mq_poll_hybrid_sleep(q, hctx, rq);
 }
 
-static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
+/**
+ * blk_poll - poll for IO completions
+ * @q:  the queue
+ * @cookie: cookie passed back at IO submission time
+ * @spin: whether to spin for completions
+ *
+ * Description:
+ *    Poll for completions on the passed in queue. Returns number of
+ *    completed entries found. If @spin is true, then blk_poll will continue
+ *    looping until at least one completion is found, unless the task is
+ *    otherwise marked running (or we need to reschedule).
+ */
+int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 {
 	struct blk_mq_hw_ctx *hctx;
 	long state;
 
-	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+	if (!blk_qc_t_valid(cookie) ||
+	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
 		return 0;
 
+	if (current->plug)
+		blk_flush_plug_list(current->plug, false);
+
 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
 
 	/*
@@ -3448,6 +3461,7 @@ static int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
 	__set_current_state(TASK_RUNNING);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(blk_poll);
 
 unsigned int blk_mq_rq_cpu(struct request *rq)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 08d940f85fa0..0b3874bdbc6a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -283,7 +283,6 @@ static inline unsigned short req_get_ioprio(struct request *req)
 struct blk_queue_ctx;
 
 typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio);
-typedef int (poll_q_fn) (struct request_queue *q, blk_qc_t, bool spin);
 
 struct bio_vec;
 typedef int (dma_drain_needed_fn)(struct request *);
@@ -401,7 +400,6 @@ struct request_queue {
 	struct rq_qos		*rq_qos;
 
 	make_request_fn		*make_request_fn;
-	poll_q_fn		*poll_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 
 	const struct blk_mq_ops	*mq_ops;

From 376f7ef8bfeaee3993c2e85df1bbaa06725b9342 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:27 +0100
Subject: [PATCH 194/351] block: only allow polling if a poll queue_map exists

This avoids having to have differnet mq_ops for different setups
with or without poll queues.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-sysfs.c       |  2 +-
 drivers/nvme/host/pci.c | 29 +++++++++--------------------
 2 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9f0cb370b39b..bb7c642ffefa 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -402,7 +402,7 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	unsigned long poll_on;
 	ssize_t ret;
 
-	if (!q->mq_ops || !q->mq_ops->poll)
+	if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL)
 		return -EINVAL;
 
 	ret = queue_var_store(&poll_on, page, count);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4ccb4ea22ac6..7732c4979a4e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1602,22 +1602,15 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.timeout	= nvme_timeout,
 };
 
-#define NVME_SHARED_MQ_OPS					\
-	.queue_rq		= nvme_queue_rq,		\
-	.commit_rqs		= nvme_commit_rqs,		\
-	.complete		= nvme_pci_complete_rq,		\
-	.init_hctx		= nvme_init_hctx,		\
-	.init_request		= nvme_init_request,		\
-	.map_queues		= nvme_pci_map_queues,		\
-	.timeout		= nvme_timeout			\
-
 static const struct blk_mq_ops nvme_mq_ops = {
-	NVME_SHARED_MQ_OPS,
-};
-
-static const struct blk_mq_ops nvme_mq_poll_ops = {
-	NVME_SHARED_MQ_OPS,
-	.poll			= nvme_poll,
+	.queue_rq	= nvme_queue_rq,
+	.complete	= nvme_pci_complete_rq,
+	.commit_rqs	= nvme_commit_rqs,
+	.init_hctx	= nvme_init_hctx,
+	.init_request	= nvme_init_request,
+	.map_queues	= nvme_pci_map_queues,
+	.timeout	= nvme_timeout,
+	.poll		= nvme_poll,
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)
@@ -2304,11 +2297,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	int ret;
 
 	if (!dev->ctrl.tagset) {
-		if (dev->io_queues[HCTX_TYPE_POLL])
-			dev->tagset.ops = &nvme_mq_poll_ops;
-		else
-			dev->tagset.ops = &nvme_mq_ops;
-
+		dev->tagset.ops = &nvme_mq_ops;
 		dev->tagset.nr_hw_queues = dev->online_queues - 1;
 		dev->tagset.nr_maps = HCTX_MAX_TYPES;
 		dev->tagset.timeout = NVME_IO_TIMEOUT;

From 6544d229bf433b755e77800002e078e54cd9b42b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 2 Dec 2018 17:46:28 +0100
Subject: [PATCH 195/351] block: enable polling by default if a poll map is
 initalized

If the user did setup polling in the driver we should not require
another know in the block layer to enable it.

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 50d529602e05..eabc7fcd96db 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2826,6 +2826,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->tag_set = set;
 
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
+	if (set->nr_maps > HCTX_TYPE_POLL)
+		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
 		blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);

From 6e0de61107f03c3222550d9b548cd331d31d82d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 5 Dec 2018 06:50:40 -0700
Subject: [PATCH 196/351] blk-mq: remove QUEUE_FLAG_POLL from default MQ flags

We only support polling if we have poll queues now, but the flag is
being set by default. Remove the default QUEUE_FLAG_POLL setting, we'll
set it in blk_mq_init_allocated_queue() if we have poll queues available
for this device.

Fixes: 6544d229bf43 ("block: enable polling by default if a poll map is initalized")
Reported-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blkdev.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0b3874bdbc6a..81f1b105946b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -606,8 +606,7 @@ struct request_queue {
 				 (1 << QUEUE_FLAG_ADD_RANDOM))
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
-				 (1 << QUEUE_FLAG_SAME_COMP)	|	\
-				 (1 << QUEUE_FLAG_POLL))
+				 (1 << QUEUE_FLAG_SAME_COMP))
 
 void blk_queue_flag_set(unsigned int flag, struct request_queue *q);
 void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);

From 0fe061b9f03c27d0370888efc22d4b3ac7af90cf Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:26 -0500
Subject: [PATCH 197/351] blkcg: fix ref count issue with bio_blkcg() using
 task_css

The bio_blkcg() function turns out to be inconsistent and consequently
dangerous to use. The first part returns a blkcg where a reference is
owned by the bio meaning it does not need to be rcu protected. However,
the third case, the last line, is problematic:

	return css_to_blkcg(task_css(current, io_cgrp_id));

This can race against task migration and the cgroup dying. It is also
semantically different as it must be called rcu protected and is
susceptible to failure when trying to get a reference to it.

This patch adds association ahead of calling bio_blkcg() rather than
after. This makes association a required and explicit step along the
code paths for calling bio_blkcg(). In blk-iolatency, association is
moved above the bio_blkcg() call to ensure it will not return %NULL.

BFQ uses the old bio_blkcg() function, but I do not want to address it
in this series due to the complexity. I have created a private version
documenting the inconsistency and noting not to use it.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bfq-cgroup.c         |   4 +-
 block/bfq-iosched.c        |   2 +-
 block/bio.c                |  10 +++-
 block/blk-iolatency.c      |   2 +-
 include/linux/blk-cgroup.h | 100 +++++++++++++++++++++++++++++++++----
 5 files changed, 103 insertions(+), 15 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index a7a1712632b0..c6113af31960 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	uint64_t serial_nr;
 
 	rcu_read_lock();
-	serial_nr = bio_blkcg(bio)->css.serial_nr;
+	serial_nr = __bio_blkcg(bio)->css.serial_nr;
 
 	/*
 	 * Check whether blkcg has changed.  The condition may trigger
@@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
 	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
 		goto out;
 
-	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
 	/*
 	 * Update blkg_path for bfq_log_* functions. We cache this
 	 * path, and update it here, for the following
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 67b22c924aee..3d1f319fe977 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -4384,7 +4384,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
 
 	rcu_read_lock();
 
-	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+	bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
 	if (!bfqg) {
 		bfqq = &bfqd->oom_bfqq;
 		goto out;
diff --git a/block/bio.c b/block/bio.c
index 03895cc0d74a..346a7f5cb2dd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1990,13 +1990,19 @@ int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
  *
  * This function takes an extra reference of @blkcg_css which will be put
  * when @bio is released.  The caller must own @bio and is responsible for
- * synchronizing calls to this function.
+ * synchronizing calls to this function.  If @blkcg_css is %NULL, a call to
+ * blkcg_get_css() finds the current css from the kthread or task.
  */
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 {
 	if (unlikely(bio->bi_css))
 		return -EBUSY;
-	css_get(blkcg_css);
+
+	if (blkcg_css)
+		css_get(blkcg_css);
+	else
+		blkcg_css = blkcg_get_css();
+
 	bio->bi_css = blkcg_css;
 	return 0;
 }
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 5f7f1773be61..fe0c4ca312ff 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -481,8 +481,8 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 		return;
 
 	rcu_read_lock();
+	bio_associate_blkcg(bio, NULL);
 	blkcg = bio_blkcg(bio);
-	bio_associate_blkcg(bio, &blkcg->css);
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		spin_lock_irq(&q->queue_lock);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a9e2e2037129..f619307171a6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -227,22 +227,103 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		   char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static inline struct cgroup_subsys_state *blkcg_css(void)
+{
+	struct cgroup_subsys_state *css;
+
+	css = kthread_blkcg();
+	if (css)
+		return css;
+	return task_css(current, io_cgrp_id);
+}
+
+/**
+ * blkcg_get_css - find and get a reference to the css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This takes a reference on the blkcg which will need to be managed by the
+ * caller.
+ */
+static inline struct cgroup_subsys_state *blkcg_get_css(void)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+
+	css = kthread_blkcg();
+	if (css) {
+		css_get(css);
+	} else {
+		/*
+		 * This is a bit complicated.  It is possible task_css() is
+		 * seeing an old css pointer here.  This is caused by the
+		 * current thread migrating away from this cgroup and this
+		 * cgroup dying.  css_tryget() will fail when trying to take a
+		 * ref on a cgroup that's ref count has hit 0.
+		 *
+		 * Therefore, if it does fail, this means current must have
+		 * been swapped away already and this is waiting for it to
+		 * propagate on the polling cpu.  Hence the use of cpu_relax().
+		 */
+		while (true) {
+			css = task_css(current, io_cgrp_id);
+			if (likely(css_tryget(css)))
+				break;
+			cpu_relax();
+		}
+	}
+
+	rcu_read_unlock();
+
+	return css;
+}
 
 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct blkcg, css) : NULL;
 }
 
-static inline struct blkcg *bio_blkcg(struct bio *bio)
+/**
+ * __bio_blkcg - internal, inconsistent version to get blkcg
+ *
+ * DO NOT USE.
+ * This function is inconsistent and consequently is dangerous to use.  The
+ * first part of the function returns a blkcg where a reference is owned by the
+ * bio.  This means it does not need to be rcu protected as it cannot go away
+ * with the bio owning a reference to it.  However, the latter potentially gets
+ * it from task_css().  This can race against task migration and the cgroup
+ * dying.  It is also semantically different as it must be called rcu protected
+ * and is susceptible to failure when trying to get a reference to it.
+ * Therefore, it is not ok to assume that *_get() will always succeed on the
+ * blkcg returned here.
+ */
+static inline struct blkcg *__bio_blkcg(struct bio *bio)
 {
-	struct cgroup_subsys_state *css;
-
 	if (bio && bio->bi_css)
 		return css_to_blkcg(bio->bi_css);
-	css = kthread_blkcg();
-	if (css)
-		return css_to_blkcg(css);
-	return css_to_blkcg(task_css(current, io_cgrp_id));
+	return css_to_blkcg(blkcg_css());
+}
+
+/**
+ * bio_blkcg - grab the blkcg associated with a bio
+ * @bio: target bio
+ *
+ * This returns the blkcg associated with a bio, %NULL if not associated.
+ * Callers are expected to either handle %NULL or know association has been
+ * done prior to calling this.
+ */
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+	if (bio && bio->bi_css)
+		return css_to_blkcg(bio->bi_css);
+	return NULL;
 }
 
 static inline bool blk_cgroup_congested(void)
@@ -710,10 +791,10 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	bool throtl = false;
 
 	rcu_read_lock();
-	blkcg = bio_blkcg(bio);
 
 	/* associate blkcg if bio hasn't attached one */
-	bio_associate_blkcg(bio, &blkcg->css);
+	bio_associate_blkcg(bio, NULL);
+	blkcg = bio_blkcg(bio);
 
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
@@ -835,6 +916,7 @@ static inline int blkcg_activate_policy(struct request_queue *q,
 static inline void blkcg_deactivate_policy(struct request_queue *q,
 					   const struct blkcg_policy *pol) { }
 
+static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 
 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,

From b978962ad4f7f9c06e5aa07b2a9b22f6d600456c Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:27 -0500
Subject: [PATCH 198/351] blkcg: update blkg_lookup_create() to do locking

To know when to create a blkg, the general pattern is to do a
blkg_lookup() and if that fails, lock and do the lookup again, and if
that fails finally create. It doesn't make much sense for everyone who
wants to do creation to write this themselves.

This changes blkg_lookup_create() to do locking and implement this
pattern. The old blkg_lookup_create() is renamed to
__blkg_lookup_create().  If a call site wants to do its own error
handling or already owns the queue lock, they can use
__blkg_lookup_create(). This will be used in upcoming patches.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 28 +++++++++++++++++++++++++---
 block/blk-iolatency.c      |  2 +-
 include/linux/blk-cgroup.h |  4 +++-
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 63d226a084cd..b421a9457e05 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -249,7 +249,7 @@ err_free_blkg:
 }
 
 /**
- * blkg_lookup_create - lookup blkg, try to create one if not there
+ * __blkg_lookup_create - lookup blkg, try to create one if not there
  * @blkcg: blkcg of interest
  * @q: request_queue of interest
  *
@@ -262,8 +262,8 @@ err_free_blkg:
  * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
  * dead and bypassing, returns ERR_PTR(-EBUSY).
  */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-				    struct request_queue *q)
+struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+				      struct request_queue *q)
 {
 	struct blkcg_gq *blkg;
 
@@ -293,6 +293,28 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 	}
 }
 
+/**
+ * blkg_lookup_create - find or create a blkg
+ * @blkcg: target block cgroup
+ * @q: target request_queue
+ *
+ * This looks up or creates the blkg representing the unique pair
+ * of the blkcg and the request_queue.
+ */
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+				    struct request_queue *q)
+{
+	struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
+
+	if (unlikely(!blkg)) {
+		spin_lock_irq(&q->queue_lock);
+		blkg = __blkg_lookup_create(blkcg, q);
+		spin_unlock_irq(&q->queue_lock);
+	}
+
+	return blkg;
+}
+
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
 	struct blkcg *blkcg = blkg->blkcg;
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index fe0c4ca312ff..e6f68f15dee9 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -486,7 +486,7 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		spin_lock_irq(&q->queue_lock);
-		blkg = blkg_lookup_create(blkcg, q);
+		blkg = __blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
 		spin_unlock_irq(&q->queue_lock);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f619307171a6..b3b1a8187d23 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -181,6 +181,8 @@ extern struct cgroup_subsys_state * const blkcg_root_css;
 
 struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
 				      struct request_queue *q, bool update_hint);
+struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
+				      struct request_queue *q);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 				    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
@@ -799,7 +801,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	blkg = blkg_lookup(blkcg, q);
 	if (unlikely(!blkg)) {
 		spin_lock_irq(&q->queue_lock);
-		blkg = blkg_lookup_create(blkcg, q);
+		blkg = __blkg_lookup_create(blkcg, q);
 		if (IS_ERR(blkg))
 			blkg = NULL;
 		spin_unlock_irq(&q->queue_lock);

From beea9da07d8a6228a7e4a31a83f9478d513bf03f Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:28 -0500
Subject: [PATCH 199/351] blkcg: convert blkg_lookup_create() to find closest
 blkg

There are several scenarios where blkg_lookup_create() can fail such as
the blkcg dying, request_queue is dying, or simply being OOM. Most
handle this by simply falling back to the q->root_blkg and calling it a
day.

This patch implements the notion of closest blkg. During
blkg_lookup_create(), if it fails to create, return the closest blkg
found or the q->root_blkg. blkg_try_get_closest() is introduced and used
during association so a bio is always attached to a blkg.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                | 17 ++++++++++-------
 block/blk-cgroup.c         | 21 +++++++++++++++------
 block/blk-iolatency.c      | 14 ++------------
 block/blk-throttle.c       |  4 +---
 include/linux/blk-cgroup.h | 24 +++++++++++++++---------
 5 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 346a7f5cb2dd..5c9828524adc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2009,21 +2009,24 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
- * bio_associate_blkg - associate a bio with the specified blkg
+ * bio_associate_blkg - associate a bio with the a blkg
  * @bio: target bio
  * @blkg: the blkg to associate
  *
- * Associate @bio with the blkg specified by @blkg.  This is the queue specific
- * blkcg information associated with the @bio, a reference will be taken on the
- * @blkg and will be freed when the bio is freed.
+ * This tries to associate @bio with the specified @blkg.  Association failure
+ * is handled by walking up the blkg tree.  Therefore, the blkg associated can
+ * be anything between @blkg and the root_blkg.  This situation only happens
+ * when a cgroup is dying and then the remaining bios will spill to the closest
+ * alive blkg.
+ *
+ * A reference will be taken on the @blkg and will be released when @bio is
+ * freed.
  */
 int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
 {
 	if (unlikely(bio->bi_blkg))
 		return -EBUSY;
-	if (!blkg_try_get(blkg))
-		return -ENODEV;
-	bio->bi_blkg = blkg;
+	bio->bi_blkg = blkg_try_get_closest(blkg);
 	return 0;
 }
 
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b421a9457e05..120f2e2835fb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -258,9 +258,8 @@ err_free_blkg:
  * that all non-root blkg's have access to the parent blkg.  This function
  * should be called under RCU read lock and @q->queue_lock.
  *
- * Returns pointer to the looked up or created blkg on success, ERR_PTR()
- * value on error.  If @q is dead, returns ERR_PTR(-EINVAL).  If @q is not
- * dead and bypassing, returns ERR_PTR(-EBUSY).
+ * Returns the blkg or the closest blkg if blkg_create() fails as it walks
+ * down from root.
  */
 struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 				      struct request_queue *q)
@@ -276,19 +275,29 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
 
 	/*
 	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
-	 * non-root blkgs have access to their parents.
+	 * non-root blkgs have access to their parents.  Returns the closest
+	 * blkg to the intended blkg should blkg_create() fail.
 	 */
 	while (true) {
 		struct blkcg *pos = blkcg;
 		struct blkcg *parent = blkcg_parent(blkcg);
+		struct blkcg_gq *ret_blkg = q->root_blkg;
 
-		while (parent && !__blkg_lookup(parent, q, false)) {
+		while (parent) {
+			blkg = __blkg_lookup(parent, q, false);
+			if (blkg) {
+				/* remember closest blkg */
+				ret_blkg = blkg;
+				break;
+			}
 			pos = parent;
 			parent = blkcg_parent(parent);
 		}
 
 		blkg = blkg_create(pos, q, NULL);
-		if (pos == blkcg || IS_ERR(blkg))
+		if (IS_ERR(blkg))
+			return ret_blkg;
+		if (pos == blkcg)
 			return blkg;
 	}
 }
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index e6f68f15dee9..46e86c34cf79 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -483,21 +483,11 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 	rcu_read_lock();
 	bio_associate_blkcg(bio, NULL);
 	blkcg = bio_blkcg(bio);
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg)) {
-		spin_lock_irq(&q->queue_lock);
-		blkg = __blkg_lookup_create(blkcg, q);
-		if (IS_ERR(blkg))
-			blkg = NULL;
-		spin_unlock_irq(&q->queue_lock);
-	}
-	if (!blkg)
-		goto out;
-
+	blkg = blkg_lookup_create(blkcg, q);
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 	bio_associate_blkg(bio, blkg);
-out:
 	rcu_read_unlock();
+
 	while (blkg && blkg->parent) {
 		struct iolatency_grp *iolat = blkg_to_lat(blkg);
 		if (!iolat) {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 8f0a104770ee..d648d6720f46 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2118,9 +2118,7 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
 {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	/* fallback to root_blkg if we fail to get a blkg ref */
-	if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV))
-		bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg);
+	bio_associate_blkg(bio, tg_to_blkg(tg));
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b3b1a8187d23..c08e96e521ed 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -545,6 +545,20 @@ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
 	return NULL;
 }
 
+/**
+ * blkg_try_get_closest - try and get a blkg ref on the closet blkg
+ * @blkg: blkg to get
+ *
+ * This walks up the blkg tree to find the closest non-dying blkg and returns
+ * the blkg that it did association with as it may not be the passed in blkg.
+ */
+static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg)
+{
+	while (!atomic_inc_not_zero(&blkg->refcnt))
+		blkg = blkg->parent;
+
+	return blkg;
+}
 
 void __blkg_release_rcu(struct rcu_head *rcu);
 
@@ -797,15 +811,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	/* associate blkcg if bio hasn't attached one */
 	bio_associate_blkcg(bio, NULL);
 	blkcg = bio_blkcg(bio);
-
-	blkg = blkg_lookup(blkcg, q);
-	if (unlikely(!blkg)) {
-		spin_lock_irq(&q->queue_lock);
-		blkg = __blkg_lookup_create(blkcg, q);
-		if (IS_ERR(blkg))
-			blkg = NULL;
-		spin_unlock_irq(&q->queue_lock);
-	}
+	blkg = blkg_lookup_create(blkcg, q);
 
 	throtl = blk_throtl_bio(q, blkg, bio);
 

From 2268c0feb0ffb1c1bb6e1d4d5505d30f485aa77b Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:29 -0500
Subject: [PATCH 200/351] blkcg: introduce common blkg association logic

There are 3 ways blkg association can happen: association with the
current css, with the page css (swap), or from the wbc css (writeback).

This patch handles how association is done for the first case where we
are associating bsaed on the current css. If there is already a blkg
associated, the css will be reused and association will be redone as the
request_queue may have changed.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c           | 62 ++++++++++++++++++++++++++++++++++++-------
 block/blk-iolatency.c | 10 ++-----
 block/blk-throttle.c  |  6 ++---
 include/linux/bio.h   |  5 +++-
 4 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 5c9828524adc..452b8e79b998 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2009,7 +2009,21 @@ int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
 EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 
 /**
- * bio_associate_blkg - associate a bio with the a blkg
+ * bio_disassociate_blkg - puts back the blkg reference if associated
+ * @bio: target bio
+ *
+ * Helper to disassociate the blkg from @bio if a blkg is associated.
+ */
+void bio_disassociate_blkg(struct bio *bio)
+{
+	if (bio->bi_blkg) {
+		blkg_put(bio->bi_blkg);
+		bio->bi_blkg = NULL;
+	}
+}
+
+/**
+ * __bio_associate_blkg - associate a bio with the a blkg
  * @bio: target bio
  * @blkg: the blkg to associate
  *
@@ -2022,12 +2036,42 @@ EXPORT_SYMBOL_GPL(bio_associate_blkcg);
  * A reference will be taken on the @blkg and will be released when @bio is
  * freed.
  */
-int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
+static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
 {
-	if (unlikely(bio->bi_blkg))
-		return -EBUSY;
+	bio_disassociate_blkg(bio);
+
 	bio->bi_blkg = blkg_try_get_closest(blkg);
-	return 0;
+}
+
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and request_queue.
+ * If one is not found, bio_lookup_blkg() creates the blkg.  If a blkg is
+ * already associated, the css is reused and association redone as the
+ * request_queue may have changed.
+ */
+void bio_associate_blkg(struct bio *bio)
+{
+	struct request_queue *q = bio->bi_disk->queue;
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	bio_associate_blkcg(bio, NULL);
+	blkcg = bio_blkcg(bio);
+
+	if (!blkcg->css.parent) {
+		__bio_associate_blkg(bio, q->root_blkg);
+	} else {
+		blkg = blkg_lookup_create(blkcg, q);
+
+		__bio_associate_blkg(bio, blkg);
+	}
+
+	rcu_read_unlock();
 }
 
 /**
@@ -2040,10 +2084,7 @@ void bio_disassociate_task(struct bio *bio)
 		css_put(bio->bi_css);
 		bio->bi_css = NULL;
 	}
-	if (bio->bi_blkg) {
-		blkg_put(bio->bi_blkg);
-		bio->bi_blkg = NULL;
-	}
+	bio_disassociate_blkg(bio);
 }
 
 /**
@@ -2055,6 +2096,9 @@ void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
 {
 	if (src->bi_css)
 		WARN_ON(bio_associate_blkcg(dst, src->bi_css));
+
+	if (src->bi_blkg)
+		__bio_associate_blkg(dst, src->bi_blkg);
 }
 EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
 #endif /* CONFIG_BLK_CGROUP */
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 46e86c34cf79..cdbd10564e66 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -472,21 +472,15 @@ static void check_scale_change(struct iolatency_grp *iolat)
 static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 {
 	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-	struct blkcg *blkcg;
 	struct blkcg_gq *blkg;
-	struct request_queue *q = rqos->q;
 	bool issue_as_root = bio_issue_as_root_blkg(bio);
 
 	if (!blk_iolatency_enabled(blkiolat))
 		return;
 
-	rcu_read_lock();
-	bio_associate_blkcg(bio, NULL);
-	blkcg = bio_blkcg(bio);
-	blkg = blkg_lookup_create(blkcg, q);
+	bio_associate_blkg(bio);
+	blkg = bio->bi_blkg;
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-	bio_associate_blkg(bio, blkg);
-	rcu_read_unlock();
 
 	while (blkg && blkg->parent) {
 		struct iolatency_grp *iolat = blkg_to_lat(blkg);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d648d6720f46..228c3a007ebc 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2115,10 +2115,10 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 }
 #endif
 
-static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
+static void blk_throtl_assoc_bio(struct bio *bio)
 {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	bio_associate_blkg(bio, tg_to_blkg(tg));
+	bio_associate_blkg(bio);
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
@@ -2143,7 +2143,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	throtl_update_latency_buckets(td);
 
-	blk_throtl_assoc_bio(tg, bio);
+	blk_throtl_assoc_bio(bio);
 	blk_throtl_update_idletime(tg);
 
 	sq = &tg->service_queue;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 056fb627edb3..62715a5a4f32 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -511,12 +511,15 @@ static inline int bio_associate_blkcg_from_page(struct bio *bio,
 
 #ifdef CONFIG_BLK_CGROUP
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
-int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg);
+void bio_disassociate_blkg(struct bio *bio);
+void bio_associate_blkg(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
 static inline int bio_associate_blkcg(struct bio *bio,
 			struct cgroup_subsys_state *blkcg_css) { return 0; }
+static inline void bio_disassociate_blkg(struct bio *bio) { }
+static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkcg_association(struct bio *dst,
 			struct bio *src) { }

From 892ad71f622bbf39c6de321d5ca9b0fdec237c24 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:30 -0500
Subject: [PATCH 201/351] dm: set the static flush bio device on demand

The next patch changes the macro bio_set_dev() to associate a bio with a
blkg based on the device set. However, dm creates a static bio to be
used as the basis for cloning empty flush bios on creation. The
bio_set_dev() call in alloc_dev() will cause problems with the next
patch adding association to bio_set_dev() because the call is before the
bdev is associated with a gendisk (bd_disk is %NULL). To get around
this, set the device on the static bio every time and use that to clone
to the other bios.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Mike Snitzer <snitzer@redhat.com>
Cc: Alasdair Kergon <agk@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c     |  1 +
 drivers/md/dm.c | 12 +++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index 452b8e79b998..41ebb3f8e2fc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2021,6 +2021,7 @@ void bio_disassociate_blkg(struct bio *bio)
 		bio->bi_blkg = NULL;
 	}
 }
+EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
 
 /**
  * __bio_associate_blkg - associate a bio with the a blkg
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a733e4c920af..ab72d79775ee 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1417,10 +1417,21 @@ static int __send_empty_flush(struct clone_info *ci)
 	unsigned target_nr = 0;
 	struct dm_target *ti;
 
+	/*
+	 * Empty flush uses a statically initialized bio, &md->flush_bio, as
+	 * the base for cloning.  However, blkg association requires that a
+	 * bdev is associated with a gendisk, which doesn't happen until the
+	 * bdev is opened.  So, blkg association is done at issue time of the
+	 * flush rather than when the device is created in alloc_dev().
+	 */
+	bio_set_dev(ci->bio, ci->io->md->bdev);
+
 	BUG_ON(bio_has_data(ci->bio));
 	while ((ti = dm_table_get_target(ci->map, target_nr++)))
 		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
 
+	bio_disassociate_blkg(ci->bio);
+
 	return 0;
 }
 
@@ -1939,7 +1950,6 @@ static struct mapped_device *alloc_dev(int minor)
 		goto bad;
 
 	bio_init(&md->flush_bio, NULL, 0);
-	bio_set_dev(&md->flush_bio, md->bdev);
 	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
 
 	dm_stats_init(&md->stats);

From 5cdf2e3fea5ee37b66842d76a9b06e6dac0b933d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:31 -0500
Subject: [PATCH 202/351] blkcg: associate blkg when associating a device

Previously, blkg association was handled by controller specific code in
blk-throttle and blk-iolatency. However, because a blkg represents a
relationship between a blkcg and a request_queue, it makes sense to keep
the blkg->q and bio->bi_disk->queue consistent.

This patch moves association into the bio_set_dev macro(). This should
cover the majority of cases where the device is set/changed keeping the
two pointers consistent. Fallback code is added to
blkcg_bio_issue_check() to catch any missing paths.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                |  1 +
 block/blk-iolatency.c      |  4 +---
 block/blk-throttle.c       |  1 -
 include/linux/bio.h        |  2 ++
 include/linux/blk-cgroup.h | 18 ++++++++++--------
 5 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 41ebb3f8e2fc..1e852ab904aa 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2074,6 +2074,7 @@ void bio_associate_blkg(struct bio *bio)
 
 	rcu_read_unlock();
 }
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
 
 /**
  * bio_disassociate_task - undo bio_associate_current()
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index cdbd10564e66..e6b47c255521 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -472,14 +472,12 @@ static void check_scale_change(struct iolatency_grp *iolat)
 static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 {
 	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
-	struct blkcg_gq *blkg;
+	struct blkcg_gq *blkg = bio->bi_blkg;
 	bool issue_as_root = bio_issue_as_root_blkg(bio);
 
 	if (!blk_iolatency_enabled(blkiolat))
 		return;
 
-	bio_associate_blkg(bio);
-	blkg = bio->bi_blkg;
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 
 	while (blkg && blkg->parent) {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 228c3a007ebc..1c6529df2002 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2118,7 +2118,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 static void blk_throtl_assoc_bio(struct bio *bio)
 {
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	bio_associate_blkg(bio);
 	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 #endif
 }
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 62715a5a4f32..6ee2ea8b378a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -491,12 +491,14 @@ do {						\
 		bio_clear_flag(bio, BIO_THROTTLED);\
 	(bio)->bi_disk = (bdev)->bd_disk;	\
 	(bio)->bi_partno = (bdev)->bd_partno;	\
+	bio_associate_blkg(bio);		\
 } while (0)
 
 #define bio_copy_dev(dst, src)			\
 do {						\
 	(dst)->bi_disk = (src)->bi_disk;	\
 	(dst)->bi_partno = (src)->bi_partno;	\
+	bio_clone_blkcg_association(dst, src);	\
 } while (0)
 
 #define bio_dev(bio) \
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index c08e96e521ed..f09752968c2a 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -21,6 +21,7 @@
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 #include <linux/kthread.h>
+#include <linux/fs.h>
 
 /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
 #define BLKG_STAT_CPU_BATCH	(INT_MAX / 2)
@@ -802,21 +803,23 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio)
 {
-	struct blkcg *blkcg;
 	struct blkcg_gq *blkg;
 	bool throtl = false;
 
-	rcu_read_lock();
+	if (!bio->bi_blkg) {
+		char b[BDEVNAME_SIZE];
 
-	/* associate blkcg if bio hasn't attached one */
-	bio_associate_blkcg(bio, NULL);
-	blkcg = bio_blkcg(bio);
-	blkg = blkg_lookup_create(blkcg, q);
+		WARN_ONCE(1,
+			  "no blkg associated for bio on block-device: %s\n",
+			  bio_devname(bio, b));
+		bio_associate_blkg(bio);
+	}
+
+	blkg = bio->bi_blkg;
 
 	throtl = blk_throtl_bio(q, blkg, bio);
 
 	if (!throtl) {
-		blkg = blkg ?: q->root_blkg;
 		/*
 		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
 		 * is a split bio and we would have already accounted for the
@@ -828,7 +831,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
-	rcu_read_unlock();
 	return !throtl;
 }
 

From e439bedf6b24264f620cc05627e23a90054bde41 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:32 -0500
Subject: [PATCH 203/351] blkcg: consolidate bio_issue_init() to be a part of
 core

bio_issue_init among other things initializes the timestamp for an IO.
Rather than have this logic handled by policies, this consolidates it to
be on the init paths (normal, clone, bounce clone).

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Liu Bo <bo.liu@linux.alibaba.com>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                | 1 +
 block/blk-iolatency.c      | 2 --
 block/blk-throttle.c       | 8 --------
 block/bounce.c             | 1 +
 include/linux/blk-cgroup.h | 9 +++++++++
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 1e852ab904aa..90089124b512 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -611,6 +611,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_io_vec = bio_src->bi_io_vec;
 
 	bio_clone_blkcg_association(bio, bio_src);
+	blkcg_bio_issue_init(bio);
 }
 EXPORT_SYMBOL(__bio_clone_fast);
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index e6b47c255521..5a79f06a730d 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -478,8 +478,6 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
 	if (!blk_iolatency_enabled(blkiolat))
 		return;
 
-	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-
 	while (blkg && blkg->parent) {
 		struct iolatency_grp *iolat = blkg_to_lat(blkg);
 		if (!iolat) {
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 1c6529df2002..1b97a73d2fb1 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -2115,13 +2115,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td)
 }
 #endif
 
-static void blk_throtl_assoc_bio(struct bio *bio)
-{
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
-#endif
-}
-
 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 		    struct bio *bio)
 {
@@ -2142,7 +2135,6 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 
 	throtl_update_latency_buckets(td);
 
-	blk_throtl_assoc_bio(bio);
 	blk_throtl_update_idletime(tg);
 
 	sq = &tg->service_queue;
diff --git a/block/bounce.c b/block/bounce.c
index 559c55bda040..cfb96d5170d0 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -278,6 +278,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 	}
 
 	bio_clone_blkcg_association(bio, bio_src);
+	blkcg_bio_issue_init(bio);
 
 	return bio;
 }
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f09752968c2a..8b069c3775ee 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -800,6 +800,12 @@ static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg
 				  struct bio *bio) { return false; }
 #endif
 
+
+static inline void blkcg_bio_issue_init(struct bio *bio)
+{
+	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+}
+
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio)
 {
@@ -831,6 +837,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 	}
 
+	blkcg_bio_issue_init(bio);
+
 	return !throtl;
 }
 
@@ -936,6 +944,7 @@ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
 static inline void blkg_get(struct blkcg_gq *blkg) { }
 static inline void blkg_put(struct blkcg_gq *blkg) { }
 
+static inline void blkcg_bio_issue_init(struct bio *bio) { }
 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 					 struct bio *bio) { return true; }
 

From 6a7f6d86a561473032287c8e4583eac5853c6efa Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:33 -0500
Subject: [PATCH 204/351] blkcg: associate a blkg for pages being evicted by
 swap

A prior patch in this series added blkg association to bios issued by
cgroups. There are two other paths that we want to attribute work back
to the appropriate cgroup: swap and writeback. Here we modify the way
swap tags bios to include the blkg. Writeback will be tackle in the next
patch.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 62 +++++++++++++++++++++++++++------------------
 include/linux/bio.h |  6 ++---
 mm/page_io.c        |  2 +-
 3 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 90089124b512..f0f069c1823c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1957,30 +1957,6 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
-#ifdef CONFIG_MEMCG
-/**
- * bio_associate_blkcg_from_page - associate a bio with the page's blkcg
- * @bio: target bio
- * @page: the page to lookup the blkcg from
- *
- * Associate @bio with the blkcg from @page's owning memcg.  This works like
- * every other associate function wrt references.
- */
-int bio_associate_blkcg_from_page(struct bio *bio, struct page *page)
-{
-	struct cgroup_subsys_state *blkcg_css;
-
-	if (unlikely(bio->bi_css))
-		return -EBUSY;
-	if (!page->mem_cgroup)
-		return 0;
-	blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup,
-				     &io_cgrp_subsys);
-	bio->bi_css = blkcg_css;
-	return 0;
-}
-#endif /* CONFIG_MEMCG */
-
 /**
  * bio_associate_blkcg - associate a bio with the specified blkcg
  * @bio: target bio
@@ -2045,6 +2021,44 @@ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
 	bio->bi_blkg = blkg_try_get_closest(blkg);
 }
 
+static void __bio_associate_blkg_from_css(struct bio *bio,
+					  struct cgroup_subsys_state *css)
+{
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
+	__bio_associate_blkg(bio, blkg);
+
+	rcu_read_unlock();
+}
+
+#ifdef CONFIG_MEMCG
+/**
+ * bio_associate_blkg_from_page - associate a bio with the page's blkg
+ * @bio: target bio
+ * @page: the page to lookup the blkcg from
+ *
+ * Associate @bio with the blkg from @page's owning memcg and the respective
+ * request_queue.  This works like every other associate function wrt
+ * references.
+ */
+void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+{
+	struct cgroup_subsys_state *css;
+
+	if (unlikely(bio->bi_css))
+		return;
+	if (!page->mem_cgroup)
+		return;
+
+	css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+	bio->bi_css = css;
+	__bio_associate_blkg_from_css(bio, css);
+}
+#endif /* CONFIG_MEMCG */
+
 /**
  * bio_associate_blkg - associate a bio with a blkg
  * @bio: target bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6ee2ea8b378a..f13572c254a7 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -505,10 +505,10 @@ do {						\
 	disk_devt((bio)->bi_disk)
 
 #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-int bio_associate_blkcg_from_page(struct bio *bio, struct page *page);
+void bio_associate_blkg_from_page(struct bio *bio, struct page *page);
 #else
-static inline int bio_associate_blkcg_from_page(struct bio *bio,
-						struct page *page) {  return 0; }
+static inline void bio_associate_blkg_from_page(struct bio *bio,
+						struct page *page) { }
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
diff --git a/mm/page_io.c b/mm/page_io.c
index 5bdfd21c1bd9..3475733b1926 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -339,7 +339,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 		goto out;
 	}
 	bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
-	bio_associate_blkcg_from_page(bio, page);
+	bio_associate_blkg_from_page(bio, page);
 	count_swpout_vm_event(page);
 	set_page_writeback(page);
 	unlock_page(page);

From fd42df305f804ddc0d5ac028e944784283b2f92d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:34 -0500
Subject: [PATCH 205/351] blkcg: associate writeback bios with a blkg

One of the goals of this series is to remove a separate reference to
the css of the bio. This can and should be accessed via bio_blkcg(). In
this patch, wbc_init_bio() now requires a bio to have a device
associated with it.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/admin-guide/cgroup-v2.rst |  8 +++++---
 block/bio.c                             | 18 ++++++++++++++++++
 fs/buffer.c                             | 10 +++++-----
 fs/ext4/page-io.c                       |  2 +-
 include/linux/bio.h                     |  5 +++++
 include/linux/writeback.h               |  5 +++--
 6 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 476722b7b636..baf19bf28385 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1879,8 +1879,10 @@ following two functions.
 
   wbc_init_bio(@wbc, @bio)
 	Should be called for each bio carrying writeback data and
-	associates the bio with the inode's owner cgroup.  Can be
-	called anytime between bio allocation and submission.
+	associates the bio with the inode's owner cgroup and the
+	corresponding request queue.  This must be called after
+	a queue (device) has been associated with the bio and
+	before submission.
 
   wbc_account_io(@wbc, @page, @bytes)
 	Should be called for each data segment being written out.
@@ -1899,7 +1901,7 @@ the configuration, the bio may be executed at a lower priority and if
 the writeback session is holding shared resources, e.g. a journal
 entry, may lead to priority inversion.  There is no one easy solution
 for the problem.  Filesystems can try to work around specific problem
-cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+cases by skipping wbc_init_bio() and using bio_associate_blkg()
 directly.
 
 
diff --git a/block/bio.c b/block/bio.c
index f0f069c1823c..b42477b6a225 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -2034,6 +2034,24 @@ static void __bio_associate_blkg_from_css(struct bio *bio,
 	rcu_read_unlock();
 }
 
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
+ *
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio.  This takes a reference on the css that will
+ * be put upon freeing of @bio.
+ */
+void bio_associate_blkg_from_css(struct bio *bio,
+				 struct cgroup_subsys_state *css)
+{
+	css_get(css);
+	bio->bi_css = css;
+	__bio_associate_blkg_from_css(bio, css);
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+
 #ifdef CONFIG_MEMCG
 /**
  * bio_associate_blkg_from_page - associate a bio with the page's blkg
diff --git a/fs/buffer.c b/fs/buffer.c
index 1286c2b95498..d60d61e8ed7d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 	 */
 	bio = bio_alloc(GFP_NOIO, 1);
 
-	if (wbc) {
-		wbc_init_bio(wbc, bio);
-		wbc_account_io(wbc, bh->b_page, bh->b_size);
-	}
-
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio_set_dev(bio, bh->b_bdev);
 	bio->bi_write_hint = write_hint;
@@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
 		op_flags |= REQ_PRIO;
 	bio_set_op_attrs(bio, op, op_flags);
 
+	if (wbc) {
+		wbc_init_bio(wbc, bio);
+		wbc_account_io(wbc, bh->b_page, bh->b_size);
+	}
+
 	submit_bio(bio);
 	return 0;
 }
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db7590178dfc..2aa62d58d8dd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
 	bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
 	if (!bio)
 		return -ENOMEM;
-	wbc_init_bio(io->io_wbc, bio);
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 	bio_set_dev(bio, bh->b_bdev);
 	bio->bi_end_io = ext4_end_bio;
 	bio->bi_private = ext4_get_io_end(io->io_end);
 	io->io_bio = bio;
 	io->io_next_block = bh->b_blocknr;
+	wbc_init_bio(io->io_wbc, bio);
 	return 0;
 }
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f13572c254a7..f0438061a5a3 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -515,6 +515,8 @@ static inline void bio_associate_blkg_from_page(struct bio *bio,
 int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
+void bio_associate_blkg_from_css(struct bio *bio,
+				 struct cgroup_subsys_state *css);
 void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
@@ -522,6 +524,9 @@ static inline int bio_associate_blkcg(struct bio *bio,
 			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline void bio_disassociate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg(struct bio *bio) { }
+static inline void bio_associate_blkg_from_css(struct bio *bio,
+					       struct cgroup_subsys_state *css)
+{ }
 static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkcg_association(struct bio *dst,
 			struct bio *src) { }
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fdfd04e348f6..738a0c24874f 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -246,7 +246,8 @@ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
  *
  * @bio is a part of the writeback in progress controlled by @wbc.  Perform
  * writeback specific initialization.  This is used to apply the cgroup
- * writeback context.
+ * writeback context.  Must be called after the bio has been associated with
+ * a device.
  */
 static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 {
@@ -257,7 +258,7 @@ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
 	 * regular writeback instead of writing things out itself.
 	 */
 	if (wbc->wb)
-		bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+		bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
 }
 
 #else	/* CONFIG_CGROUP_WRITEBACK */

From db6638d7d177a8bc74c9e539e2e0d7d061c767b1 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:35 -0500
Subject: [PATCH 206/351] blkcg: remove bio->bi_css and instead use
 bio->bi_blkg

Prior patches ensured that any bio that interacts with a request_queue
is properly associated with a blkg. This makes bio->bi_css unnecessary
as blkg maintains a reference to blkcg already.

This removes the bio field bi_css and transfers corresponding uses to
access via bi_blkg.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                | 59 +++++++++-----------------------------
 block/bounce.c             |  2 +-
 drivers/block/loop.c       |  5 ++--
 drivers/md/raid0.c         |  2 +-
 include/linux/bio.h        | 11 +++----
 include/linux/blk-cgroup.h |  8 +++---
 include/linux/blk_types.h  |  7 +++--
 kernel/trace/blktrace.c    |  4 +--
 8 files changed, 32 insertions(+), 66 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index b42477b6a225..2b6bc7b805ec 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -610,7 +610,7 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
 	bio->bi_iter = bio_src->bi_iter;
 	bio->bi_io_vec = bio_src->bi_io_vec;
 
-	bio_clone_blkcg_association(bio, bio_src);
+	bio_clone_blkg_association(bio, bio_src);
 	blkcg_bio_issue_init(bio);
 }
 EXPORT_SYMBOL(__bio_clone_fast);
@@ -1957,34 +1957,6 @@ EXPORT_SYMBOL(bioset_init_from_src);
 
 #ifdef CONFIG_BLK_CGROUP
 
-/**
- * bio_associate_blkcg - associate a bio with the specified blkcg
- * @bio: target bio
- * @blkcg_css: css of the blkcg to associate
- *
- * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
- * treat @bio as if it were issued by a task which belongs to the blkcg.
- *
- * This function takes an extra reference of @blkcg_css which will be put
- * when @bio is released.  The caller must own @bio and is responsible for
- * synchronizing calls to this function.  If @blkcg_css is %NULL, a call to
- * blkcg_get_css() finds the current css from the kthread or task.
- */
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
-{
-	if (unlikely(bio->bi_css))
-		return -EBUSY;
-
-	if (blkcg_css)
-		css_get(blkcg_css);
-	else
-		blkcg_css = blkcg_get_css();
-
-	bio->bi_css = blkcg_css;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkcg);
-
 /**
  * bio_disassociate_blkg - puts back the blkg reference if associated
  * @bio: target bio
@@ -1994,6 +1966,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkcg);
 void bio_disassociate_blkg(struct bio *bio)
 {
 	if (bio->bi_blkg) {
+		/* a ref is always taken on css */
+		css_put(&bio_blkcg(bio)->css);
 		blkg_put(bio->bi_blkg);
 		bio->bi_blkg = NULL;
 	}
@@ -2047,7 +2021,6 @@ void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css)
 {
 	css_get(css);
-	bio->bi_css = css;
 	__bio_associate_blkg_from_css(bio, css);
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
@@ -2066,13 +2039,10 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 {
 	struct cgroup_subsys_state *css;
 
-	if (unlikely(bio->bi_css))
-		return;
 	if (!page->mem_cgroup)
 		return;
 
 	css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
-	bio->bi_css = css;
 	__bio_associate_blkg_from_css(bio, css);
 }
 #endif /* CONFIG_MEMCG */
@@ -2094,8 +2064,10 @@ void bio_associate_blkg(struct bio *bio)
 
 	rcu_read_lock();
 
-	bio_associate_blkcg(bio, NULL);
-	blkcg = bio_blkcg(bio);
+	if (bio->bi_blkg)
+		blkcg = bio->bi_blkg->blkcg;
+	else
+		blkcg = css_to_blkcg(blkcg_get_css());
 
 	if (!blkcg->css.parent) {
 		__bio_associate_blkg(bio, q->root_blkg);
@@ -2115,27 +2087,22 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg);
  */
 void bio_disassociate_task(struct bio *bio)
 {
-	if (bio->bi_css) {
-		css_put(bio->bi_css);
-		bio->bi_css = NULL;
-	}
 	bio_disassociate_blkg(bio);
 }
 
 /**
- * bio_clone_blkcg_association - clone blkcg association from src to dst bio
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
  * @dst: destination bio
  * @src: source bio
  */
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src)
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
 {
-	if (src->bi_css)
-		WARN_ON(bio_associate_blkcg(dst, src->bi_css));
-
-	if (src->bi_blkg)
+	if (src->bi_blkg) {
+		css_get(&bio_blkcg(src)->css);
 		__bio_associate_blkg(dst, src->bi_blkg);
+	}
 }
-EXPORT_SYMBOL_GPL(bio_clone_blkcg_association);
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
 #endif /* CONFIG_BLK_CGROUP */
 
 static void __init biovec_init_slabs(void)
diff --git a/block/bounce.c b/block/bounce.c
index cfb96d5170d0..ffb9e9ecfa7e 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -277,7 +277,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 		}
 	}
 
-	bio_clone_blkcg_association(bio, bio_src);
+	bio_clone_blkg_association(bio, bio_src);
 	blkcg_bio_issue_init(bio);
 
 	return bio;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 176ab1f28eca..0770004616de 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,6 +77,7 @@
 #include <linux/falloc.h>
 #include <linux/uio.h>
 #include <linux/ioprio.h>
+#include <linux/blk-cgroup.h>
 
 #include "loop.h"
 
@@ -1820,8 +1821,8 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	/* always use the first bio's css */
 #ifdef CONFIG_BLK_CGROUP
-	if (cmd->use_aio && rq->bio && rq->bio->bi_css) {
-		cmd->css = rq->bio->bi_css;
+	if (cmd->use_aio && rq->bio && rq->bio->bi_blkg) {
+		cmd->css = &bio_blkcg(rq->bio)->css;
 		css_get(cmd->css);
 	} else
 #endif
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index ac1cffd2a09b..f3fb5bb8c82a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -542,7 +542,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
 		    !discard_bio)
 			continue;
 		bio_chain(discard_bio, bio);
-		bio_clone_blkcg_association(discard_bio, bio);
+		bio_clone_blkg_association(discard_bio, bio);
 		if (mddev->gendisk)
 			trace_block_bio_remap(bdev_get_queue(rdev->bdev),
 				discard_bio, disk_devt(mddev->gendisk),
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f0438061a5a3..84e1c4dc703a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -498,7 +498,7 @@ do {						\
 do {						\
 	(dst)->bi_disk = (src)->bi_disk;	\
 	(dst)->bi_partno = (src)->bi_partno;	\
-	bio_clone_blkcg_association(dst, src);	\
+	bio_clone_blkg_association(dst, src);	\
 } while (0)
 
 #define bio_dev(bio) \
@@ -512,24 +512,21 @@ static inline void bio_associate_blkg_from_page(struct bio *bio,
 #endif
 
 #ifdef CONFIG_BLK_CGROUP
-int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
 void bio_disassociate_task(struct bio *bio);
-void bio_clone_blkcg_association(struct bio *dst, struct bio *src);
+void bio_clone_blkg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
-static inline int bio_associate_blkcg(struct bio *bio,
-			struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline void bio_disassociate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
 					       struct cgroup_subsys_state *css)
 { }
 static inline void bio_disassociate_task(struct bio *bio) { }
-static inline void bio_clone_blkcg_association(struct bio *dst,
-			struct bio *src) { }
+static inline void bio_clone_blkg_association(struct bio *dst,
+					      struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */
 
 #ifdef CONFIG_HIGHMEM
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 8b069c3775ee..f11c37f8ce09 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -309,8 +309,8 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
  */
 static inline struct blkcg *__bio_blkcg(struct bio *bio)
 {
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
+	if (bio && bio->bi_blkg)
+		return bio->bi_blkg->blkcg;
 	return css_to_blkcg(blkcg_css());
 }
 
@@ -324,8 +324,8 @@ static inline struct blkcg *__bio_blkcg(struct bio *bio)
  */
 static inline struct blkcg *bio_blkcg(struct bio *bio)
 {
-	if (bio && bio->bi_css)
-		return css_to_blkcg(bio->bi_css);
+	if (bio && bio->bi_blkg)
+		return bio->bi_blkg->blkcg;
 	return NULL;
 }
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index c0ba1a038ff3..46c005d601ac 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -174,10 +174,11 @@ struct bio {
 	void			*bi_private;
 #ifdef CONFIG_BLK_CGROUP
 	/*
-	 * Optional css associated with this bio.  Put on bio
-	 * release.  Read comment on top of bio_associate_current().
+	 * Represents the association of the css and request_queue for the bio.
+	 * If a bio goes direct to device, it will not have a blkg as it will
+	 * not have a request_queue associated with it.  The reference is put
+	 * on release of the bio.
 	 */
-	struct cgroup_subsys_state *bi_css;
 	struct blkcg_gq		*bi_blkg;
 	struct bio_issue	bi_issue;
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2868d85f1fb1..fac0ddf8a8e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -764,9 +764,9 @@ blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
 	if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
 		return NULL;
 
-	if (!bio->bi_css)
+	if (!bio->bi_blkg)
 		return NULL;
-	return cgroup_get_kernfs_id(bio->bi_css->cgroup);
+	return cgroup_get_kernfs_id(bio_blkcg(bio)->css.cgroup);
 }
 #else
 static union kernfs_node_id *

From fc5a828bfad628c1092194f2814604943561c52d Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:36 -0500
Subject: [PATCH 207/351] blkcg: remove additional reference to the css

The previous patch in this series removed carrying around a pointer to
the css in blkg. However, the blkg association logic still relied on
taking a reference on the css to ensure we wouldn't fail in getting a
reference for the blkg.

Here the implicit dependency on the css is removed. The association
continues to rely on the tryget logic walking up the blkg tree. This
streamlines the three ways that association can happen: normal, swap,
and writeback.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                | 66 ++++++++++++++++----------------------
 include/linux/blk-cgroup.h | 41 -----------------------
 include/linux/cgroup.h     |  2 ++
 kernel/cgroup/cgroup.c     | 48 +++++++++++++++++++++------
 4 files changed, 69 insertions(+), 88 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 2b6bc7b805ec..ce1e512dca5a 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1966,8 +1966,6 @@ EXPORT_SYMBOL(bioset_init_from_src);
 void bio_disassociate_blkg(struct bio *bio)
 {
 	if (bio->bi_blkg) {
-		/* a ref is always taken on css */
-		css_put(&bio_blkcg(bio)->css);
 		blkg_put(bio->bi_blkg);
 		bio->bi_blkg = NULL;
 	}
@@ -1995,33 +1993,31 @@ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
 	bio->bi_blkg = blkg_try_get_closest(blkg);
 }
 
-static void __bio_associate_blkg_from_css(struct bio *bio,
-					  struct cgroup_subsys_state *css)
-{
-	struct blkcg_gq *blkg;
-
-	rcu_read_lock();
-
-	blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue);
-	__bio_associate_blkg(bio, blkg);
-
-	rcu_read_unlock();
-}
-
 /**
  * bio_associate_blkg_from_css - associate a bio with a specified css
  * @bio: target bio
  * @css: target css
  *
  * Associate @bio with the blkg found by combining the css's blkg and the
- * request_queue of the @bio.  This takes a reference on the css that will
- * be put upon freeing of @bio.
+ * request_queue of the @bio.  This falls back to the queue's root_blkg if
+ * the association fails with the css.
  */
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css)
 {
-	css_get(css);
-	__bio_associate_blkg_from_css(bio, css);
+	struct request_queue *q = bio->bi_disk->queue;
+	struct blkcg_gq *blkg;
+
+	rcu_read_lock();
+
+	if (!css || !css->parent)
+		blkg = q->root_blkg;
+	else
+		blkg = blkg_lookup_create(css_to_blkcg(css), q);
+
+	__bio_associate_blkg(bio, blkg);
+
+	rcu_read_unlock();
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
 
@@ -2032,8 +2028,8 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
  * @page: the page to lookup the blkcg from
  *
  * Associate @bio with the blkg from @page's owning memcg and the respective
- * request_queue.  This works like every other associate function wrt
- * references.
+ * request_queue.  If cgroup_e_css returns %NULL, fall back to the queue's
+ * root_blkg.
  */
 void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 {
@@ -2042,8 +2038,12 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
 	if (!page->mem_cgroup)
 		return;
 
-	css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
-	__bio_associate_blkg_from_css(bio, css);
+	rcu_read_lock();
+
+	css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+	bio_associate_blkg_from_css(bio, css);
+
+	rcu_read_unlock();
 }
 #endif /* CONFIG_MEMCG */
 
@@ -2058,24 +2058,16 @@ void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
  */
 void bio_associate_blkg(struct bio *bio)
 {
-	struct request_queue *q = bio->bi_disk->queue;
-	struct blkcg *blkcg;
-	struct blkcg_gq *blkg;
+	struct cgroup_subsys_state *css;
 
 	rcu_read_lock();
 
 	if (bio->bi_blkg)
-		blkcg = bio->bi_blkg->blkcg;
+		css = &bio_blkcg(bio)->css;
 	else
-		blkcg = css_to_blkcg(blkcg_get_css());
+		css = blkcg_css();
 
-	if (!blkcg->css.parent) {
-		__bio_associate_blkg(bio, q->root_blkg);
-	} else {
-		blkg = blkg_lookup_create(blkcg, q);
-
-		__bio_associate_blkg(bio, blkg);
-	}
+	bio_associate_blkg_from_css(bio, css);
 
 	rcu_read_unlock();
 }
@@ -2097,10 +2089,8 @@ void bio_disassociate_task(struct bio *bio)
  */
 void bio_clone_blkg_association(struct bio *dst, struct bio *src)
 {
-	if (src->bi_blkg) {
-		css_get(&bio_blkcg(src)->css);
+	if (src->bi_blkg)
 		__bio_associate_blkg(dst, src->bi_blkg);
-	}
 }
 EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
 #endif /* CONFIG_BLK_CGROUP */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index f11c37f8ce09..284819a4d122 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -247,47 +247,6 @@ static inline struct cgroup_subsys_state *blkcg_css(void)
 	return task_css(current, io_cgrp_id);
 }
 
-/**
- * blkcg_get_css - find and get a reference to the css
- *
- * Find the css associated with either the kthread or the current task.
- * This takes a reference on the blkcg which will need to be managed by the
- * caller.
- */
-static inline struct cgroup_subsys_state *blkcg_get_css(void)
-{
-	struct cgroup_subsys_state *css;
-
-	rcu_read_lock();
-
-	css = kthread_blkcg();
-	if (css) {
-		css_get(css);
-	} else {
-		/*
-		 * This is a bit complicated.  It is possible task_css() is
-		 * seeing an old css pointer here.  This is caused by the
-		 * current thread migrating away from this cgroup and this
-		 * cgroup dying.  css_tryget() will fail when trying to take a
-		 * ref on a cgroup that's ref count has hit 0.
-		 *
-		 * Therefore, if it does fail, this means current must have
-		 * been swapped away already and this is waiting for it to
-		 * propagate on the polling cpu.  Hence the use of cpu_relax().
-		 */
-		while (true) {
-			css = task_css(current, io_cgrp_id);
-			if (likely(css_tryget(css)))
-				break;
-			cpu_relax();
-		}
-	}
-
-	rcu_read_unlock();
-
-	return css;
-}
-
 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 {
 	return css ? container_of(css, struct blkcg, css) : NULL;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9d12757a65b0..9968332cceed 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -93,6 +93,8 @@ extern struct css_set init_css_set;
 
 bool css_has_online_children(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
+					 struct cgroup_subsys *ss);
 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
 					     struct cgroup_subsys *ss);
 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 6aaf5dd5383b..8b79318810ad 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -493,7 +493,7 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
 }
 
 /**
- * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
  *
@@ -502,8 +502,8 @@ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
  * function is guaranteed to return non-NULL css.
  */
-static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
-						struct cgroup_subsys *ss)
+static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
+							struct cgroup_subsys *ss)
 {
 	lockdep_assert_held(&cgroup_mutex);
 
@@ -523,6 +523,35 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 	return cgroup_css(cgrp, ss);
 }
 
+/**
+ * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss.  The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ *
+ * The returned css is not guaranteed to be online, and therefore it is the
+ * callers responsiblity to tryget a reference for it.
+ */
+struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
+					 struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	do {
+		css = cgroup_css(cgrp, ss);
+
+		if (css)
+			return css;
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
+
+	return init_css_set.subsys[ss->id];
+}
+
 /**
  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
@@ -605,10 +634,11 @@ EXPORT_SYMBOL_GPL(of_css);
  *
  * Should be called under cgroup_[tree_]mutex.
  */
-#define for_each_e_css(css, ssid, cgrp)					\
-	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	\
-		if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
-			;						\
+#define for_each_e_css(css, ssid, cgrp)					    \
+	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)	    \
+		if (!((css) = cgroup_e_css_by_mask(cgrp,		    \
+						   cgroup_subsys[(ssid)]))) \
+			;						    \
 		else
 
 /**
@@ -1007,7 +1037,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
 			 * @ss is in this hierarchy, so we want the
 			 * effective css from @cgrp.
 			 */
-			template[i] = cgroup_e_css(cgrp, ss);
+			template[i] = cgroup_e_css_by_mask(cgrp, ss);
 		} else {
 			/*
 			 * @ss is not in this hierarchy, so we don't want
@@ -3024,7 +3054,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
 		return ret;
 
 	/*
-	 * At this point, cgroup_e_css() results reflect the new csses
+	 * At this point, cgroup_e_css_by_mask() results reflect the new csses
 	 * making the following cgroup_update_dfl_csses() properly update
 	 * css associations of all tasks in the subtree.
 	 */

From 6f70fb66182b02e50deea65e9a3a86b7bf659a39 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:37 -0500
Subject: [PATCH 208/351] blkcg: remove bio_disassociate_task()

Now that a bio only holds a blkg reference, so clean up is simply
putting back that reference. Remove bio_disassociate_task() as it just
calls bio_disassociate_blkg() and call the latter directly.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c         | 11 +----------
 include/linux/bio.h |  2 --
 2 files changed, 1 insertion(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index ce1e512dca5a..7ec5316e6ecc 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -244,7 +244,7 @@ fallback:
 
 void bio_uninit(struct bio *bio)
 {
-	bio_disassociate_task(bio);
+	bio_disassociate_blkg(bio);
 }
 EXPORT_SYMBOL(bio_uninit);
 
@@ -2073,15 +2073,6 @@ void bio_associate_blkg(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(bio_associate_blkg);
 
-/**
- * bio_disassociate_task - undo bio_associate_current()
- * @bio: target bio
- */
-void bio_disassociate_task(struct bio *bio)
-{
-	bio_disassociate_blkg(bio);
-}
-
 /**
  * bio_clone_blkg_association - clone blkg association from src to dst bio
  * @dst: destination bio
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 84e1c4dc703a..7380b094dcca 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -516,7 +516,6 @@ void bio_disassociate_blkg(struct bio *bio);
 void bio_associate_blkg(struct bio *bio);
 void bio_associate_blkg_from_css(struct bio *bio,
 				 struct cgroup_subsys_state *css);
-void bio_disassociate_task(struct bio *bio);
 void bio_clone_blkg_association(struct bio *dst, struct bio *src);
 #else	/* CONFIG_BLK_CGROUP */
 static inline void bio_disassociate_blkg(struct bio *bio) { }
@@ -524,7 +523,6 @@ static inline void bio_associate_blkg(struct bio *bio) { }
 static inline void bio_associate_blkg_from_css(struct bio *bio,
 					       struct cgroup_subsys_state *css)
 { }
-static inline void bio_disassociate_task(struct bio *bio) { }
 static inline void bio_clone_blkg_association(struct bio *dst,
 					      struct bio *src) { }
 #endif	/* CONFIG_BLK_CGROUP */

From 7fcf2b033b84e261dca283bc2911aaea4b07b525 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:38 -0500
Subject: [PATCH 209/351] blkcg: change blkg reference counting to use
 percpu_ref

Every bio is now associated with a blkg putting blkg_get, blkg_try_get,
and blkg_put on the hot path. Switch over the refcnt in blkg to use
percpu_ref.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 41 ++++++++++++++++++++++++++++++++++++--
 include/linux/blk-cgroup.h | 15 +++++---------
 2 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 120f2e2835fb..2ca7611fe274 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -81,6 +81,37 @@ static void blkg_free(struct blkcg_gq *blkg)
 	kfree(blkg);
 }
 
+static void __blkg_release(struct rcu_head *rcu)
+{
+	struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+
+	percpu_ref_exit(&blkg->refcnt);
+
+	/* release the blkcg and parent blkg refs this blkg has been holding */
+	css_put(&blkg->blkcg->css);
+	if (blkg->parent)
+		blkg_put(blkg->parent);
+
+	wb_congested_put(blkg->wb_congested);
+
+	blkg_free(blkg);
+}
+
+/*
+ * A group is RCU protected, but having an rcu lock does not mean that one
+ * can access all the fields of blkg and assume these are valid.  For
+ * example, don't try to follow throtl_data and request queue links.
+ *
+ * Having a reference to blkg under an rcu allows accesses to only values
+ * local to groups like group stats and group rate limits.
+ */
+static void blkg_release(struct percpu_ref *ref)
+{
+	struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);
+
+	call_rcu(&blkg->rcu_head, __blkg_release);
+}
+
 /**
  * blkg_alloc - allocate a blkg
  * @blkcg: block cgroup the new blkg is associated with
@@ -107,7 +138,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
 	blkg->q = q;
 	INIT_LIST_HEAD(&blkg->q_node);
 	blkg->blkcg = blkcg;
-	atomic_set(&blkg->refcnt, 1);
 
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
@@ -207,6 +237,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 		blkg_get(blkg->parent);
 	}
 
+	ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
+			      GFP_NOWAIT | __GFP_NOWARN);
+	if (ret)
+		goto err_cancel_ref;
+
 	/* invoke per-policy init */
 	for (i = 0; i < BLKCG_MAX_POLS; i++) {
 		struct blkcg_policy *pol = blkcg_policy[i];
@@ -239,6 +274,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	blkg_put(blkg);
 	return ERR_PTR(ret);
 
+err_cancel_ref:
+	percpu_ref_exit(&blkg->refcnt);
 err_put_congested:
 	wb_congested_put(wb_congested);
 err_put_css:
@@ -367,7 +404,7 @@ static void blkg_destroy(struct blkcg_gq *blkg)
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.
 	 */
-	blkg_put(blkg);
+	percpu_ref_kill(&blkg->refcnt);
 }
 
 /**
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 284819a4d122..d19ef15a673d 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -124,7 +124,7 @@ struct blkcg_gq {
 	struct blkcg_gq			*parent;
 
 	/* reference count */
-	atomic_t			refcnt;
+	struct percpu_ref		refcnt;
 
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
@@ -487,8 +487,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
  */
 static inline void blkg_get(struct blkcg_gq *blkg)
 {
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	atomic_inc(&blkg->refcnt);
+	percpu_ref_get(&blkg->refcnt);
 }
 
 /**
@@ -500,7 +499,7 @@ static inline void blkg_get(struct blkcg_gq *blkg)
  */
 static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
 {
-	if (atomic_inc_not_zero(&blkg->refcnt))
+	if (percpu_ref_tryget(&blkg->refcnt))
 		return blkg;
 	return NULL;
 }
@@ -514,23 +513,19 @@ static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
  */
 static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg)
 {
-	while (!atomic_inc_not_zero(&blkg->refcnt))
+	while (!percpu_ref_tryget(&blkg->refcnt))
 		blkg = blkg->parent;
 
 	return blkg;
 }
 
-void __blkg_release_rcu(struct rcu_head *rcu);
-
 /**
  * blkg_put - put a blkg reference
  * @blkg: blkg to put
  */
 static inline void blkg_put(struct blkcg_gq *blkg)
 {
-	WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-	if (atomic_dec_and_test(&blkg->refcnt))
-		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+	percpu_ref_put(&blkg->refcnt);
 }
 
 /**

From 7754f669ffde3919e398a9e591cd7510d6cf4e73 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Wed, 5 Dec 2018 12:10:39 -0500
Subject: [PATCH 210/351] blkcg: rename blkg_try_get() to blkg_tryget()

blkg reference counting now uses percpu_ref rather than atomic_t. Let's
make this consistent with css_tryget. This renames blkg_try_get to
blkg_tryget and now returns a bool rather than the blkg or %NULL.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c                |  2 +-
 block/blk-cgroup.c         |  3 +--
 block/blk-iolatency.c      |  2 +-
 include/linux/blk-cgroup.h | 12 +++++-------
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 7ec5316e6ecc..06760543ec81 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1990,7 +1990,7 @@ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
 {
 	bio_disassociate_blkg(bio);
 
-	bio->bi_blkg = blkg_try_get_closest(blkg);
+	bio->bi_blkg = blkg_tryget_closest(blkg);
 }
 
 /**
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2ca7611fe274..6bd0619a7d6e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1736,8 +1736,7 @@ void blkcg_maybe_throttle_current(void)
 	blkg = blkg_lookup(blkcg, q);
 	if (!blkg)
 		goto out;
-	blkg = blkg_try_get(blkg);
-	if (!blkg)
+	if (!blkg_tryget(blkg))
 		goto out;
 	rcu_read_unlock();
 
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 5a79f06a730d..0b14c3d57769 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -698,7 +698,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
 		 * We could be exiting, don't access the pd unless we have a
 		 * ref on the blkg.
 		 */
-		if (!blkg_try_get(blkg))
+		if (!blkg_tryget(blkg))
 			continue;
 
 		iolat = blkg_to_lat(blkg);
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index d19ef15a673d..752de1becb5c 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -491,27 +491,25 @@ static inline void blkg_get(struct blkcg_gq *blkg)
 }
 
 /**
- * blkg_try_get - try and get a blkg reference
+ * blkg_tryget - try and get a blkg reference
  * @blkg: blkg to get
  *
  * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
  * of freeing this blkg, so we can only use it if the refcnt is not zero.
  */
-static inline struct blkcg_gq *blkg_try_get(struct blkcg_gq *blkg)
+static inline bool blkg_tryget(struct blkcg_gq *blkg)
 {
-	if (percpu_ref_tryget(&blkg->refcnt))
-		return blkg;
-	return NULL;
+	return percpu_ref_tryget(&blkg->refcnt);
 }
 
 /**
- * blkg_try_get_closest - try and get a blkg ref on the closet blkg
+ * blkg_tryget_closest - try and get a blkg ref on the closet blkg
  * @blkg: blkg to get
  *
  * This walks up the blkg tree to find the closest non-dying blkg and returns
  * the blkg that it did association with as it may not be the passed in blkg.
  */
-static inline struct blkcg_gq *blkg_try_get_closest(struct blkcg_gq *blkg)
+static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg)
 {
 	while (!percpu_ref_tryget(&blkg->refcnt))
 		blkg = blkg->parent;

From 84f603246db9703aca05aad58b94a54dfbf44327 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 4 Dec 2018 12:59:02 -0500
Subject: [PATCH 211/351] block: add rq_qos_wait to rq_qos

Originally when I split out the common code from blk-wbt into rq_qos I
left the wbt_wait() where it was and simply copied and modified it
slightly to work for io-latency.  However they are both basically the
same thing, and as time has gone on wbt_wait() has ended up much smarter
and kinder than it was when I copied it into io-latency, which means
io-latency has lost out on these improvements.

Since they are the same thing essentially except for a few minor things,
create rq_qos_wait() that replicates what wbt_wait() currently does with
callbacks that can be passed in for the snowflakes to do their own thing
as appropriate.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-rq-qos.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-rq-qos.h |  6 ++++
 2 files changed, 92 insertions(+)

diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 80f603b76f61..e932ef9d2718 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -176,6 +176,92 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
 	rq_depth_calc_max_depth(rqd);
 }
 
+struct rq_qos_wait_data {
+	struct wait_queue_entry wq;
+	struct task_struct *task;
+	struct rq_wait *rqw;
+	acquire_inflight_cb_t *cb;
+	void *private_data;
+	bool got_token;
+};
+
+static int rq_qos_wake_function(struct wait_queue_entry *curr,
+				unsigned int mode, int wake_flags, void *key)
+{
+	struct rq_qos_wait_data *data = container_of(curr,
+						     struct rq_qos_wait_data,
+						     wq);
+
+	/*
+	 * If we fail to get a budget, return -1 to interrupt the wake up loop
+	 * in __wake_up_common.
+	 */
+	if (!data->cb(data->rqw, data->private_data))
+		return -1;
+
+	data->got_token = true;
+	list_del_init(&curr->entry);
+	wake_up_process(data->task);
+	return 1;
+}
+
+/**
+ * rq_qos_wait - throttle on a rqw if we need to
+ * @private_data - caller provided specific data
+ * @acquire_inflight_cb - inc the rqw->inflight counter if we can
+ * @cleanup_cb - the callback to cleanup in case we race with a waker
+ *
+ * This provides a uniform place for the rq_qos users to do their throttling.
+ * Since you can end up with a lot of things sleeping at once, this manages the
+ * waking up based on the resources available.  The acquire_inflight_cb should
+ * inc the rqw->inflight if we have the ability to do so, or return false if not
+ * and then we will sleep until the room becomes available.
+ *
+ * cleanup_cb is in case that we race with a waker and need to cleanup the
+ * inflight count accordingly.
+ */
+void rq_qos_wait(struct rq_wait *rqw, void *private_data,
+		 acquire_inflight_cb_t *acquire_inflight_cb,
+		 cleanup_cb_t *cleanup_cb)
+{
+	struct rq_qos_wait_data data = {
+		.wq = {
+			.func	= rq_qos_wake_function,
+			.entry	= LIST_HEAD_INIT(data.wq.entry),
+		},
+		.task = current,
+		.rqw = rqw,
+		.cb = acquire_inflight_cb,
+		.private_data = private_data,
+	};
+	bool has_sleeper;
+
+	has_sleeper = wq_has_sleeper(&rqw->wait);
+	if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
+		return;
+
+	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
+	do {
+		if (data.got_token)
+			break;
+		if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
+			finish_wait(&rqw->wait, &data.wq);
+
+			/*
+			 * We raced with wbt_wake_function() getting a token,
+			 * which means we now have two. Put our local token
+			 * and wake anyone else potentially waiting for one.
+			 */
+			if (data.got_token)
+				cleanup_cb(rqw, private_data);
+			break;
+		}
+		io_schedule();
+		has_sleeper = false;
+	} while (1);
+	finish_wait(&rqw->wait, &data.wq);
+}
+
 void rq_qos_exit(struct request_queue *q)
 {
 	while (q->rq_qos) {
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 6e09e98b93ea..8678875de420 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -93,6 +93,12 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
 	}
 }
 
+typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
+typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);
+
+void rq_qos_wait(struct rq_wait *rqw, void *private_data,
+		 acquire_inflight_cb_t *acquire_inflight_cb,
+		 cleanup_cb_t *cleanup_cb);
 bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
 void rq_depth_scale_up(struct rq_depth *rqd);
 void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);

From b6c7b58f5fcc2386bddf9852011c42c1d2b83979 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 4 Dec 2018 12:59:03 -0500
Subject: [PATCH 212/351] block: convert wbt_wait() to use rq_qos_wait()

Now that we have rq_qos_wait() in place, convert wbt_wait() over to
using it with it's specific callbacks.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-wbt.c | 65 +++++++++----------------------------------------
 1 file changed, 11 insertions(+), 54 deletions(-)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index d051ebfb4852..40207edd1d89 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -489,31 +489,21 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 }
 
 struct wbt_wait_data {
-	struct wait_queue_entry wq;
-	struct task_struct *task;
 	struct rq_wb *rwb;
-	struct rq_wait *rqw;
+	enum wbt_flags wb_acct;
 	unsigned long rw;
-	bool got_token;
 };
 
-static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode,
-			     int wake_flags, void *key)
+static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
 {
-	struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data,
-							wq);
+	struct wbt_wait_data *data = private_data;
+	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
+}
 
-	/*
-	 * If we fail to get a budget, return -1 to interrupt the wake up
-	 * loop in __wake_up_common.
-	 */
-	if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw)))
-		return -1;
-
-	data->got_token = true;
-	list_del_init(&curr->entry);
-	wake_up_process(data->task);
-	return 1;
+static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
+{
+	struct wbt_wait_data *data = private_data;
+	wbt_rqw_done(data->rwb, rqw, data->wb_acct);
 }
 
 /*
@@ -525,45 +515,12 @@ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
 {
 	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
 	struct wbt_wait_data data = {
-		.wq = {
-			.func	= wbt_wake_function,
-			.entry	= LIST_HEAD_INIT(data.wq.entry),
-		},
-		.task = current,
 		.rwb = rwb,
-		.rqw = rqw,
+		.wb_acct = wb_acct,
 		.rw = rw,
 	};
-	bool has_sleeper;
 
-	has_sleeper = wq_has_sleeper(&rqw->wait);
-	if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw)))
-		return;
-
-	prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
-	do {
-		if (data.got_token)
-			break;
-
-		if (!has_sleeper &&
-		    rq_wait_inc_below(rqw, get_limit(rwb, rw))) {
-			finish_wait(&rqw->wait, &data.wq);
-
-			/*
-			 * We raced with wbt_wake_function() getting a token,
-			 * which means we now have two. Put our local token
-			 * and wake anyone else potentially waiting for one.
-			 */
-			if (data.got_token)
-				wbt_rqw_done(rwb, rqw, wb_acct);
-			break;
-		}
-
-		io_schedule();
-		has_sleeper = false;
-	} while (1);
-
-	finish_wait(&rqw->wait, &data.wq);
+	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
 }
 
 static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)

From d3fcdff19054575a368dfdac7407cabffea36c43 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@toxicpanda.com>
Date: Tue, 4 Dec 2018 12:59:04 -0500
Subject: [PATCH 213/351] block: convert io-latency to use rq_qos_wait

Now that we have this common helper, convert io-latency over to use it
as well.

Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-iolatency.c | 31 ++++++++-----------------------
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 0b14c3d57769..bee092727cad 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -262,15 +262,15 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
 				   stat->rqs.mean);
 }
 
-static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
-				       wait_queue_entry_t *wait,
-				       bool first_block)
+static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
 {
-	struct rq_wait *rqw = &iolat->rq_wait;
+	atomic_dec(&rqw->inflight);
+	wake_up(&rqw->wait);
+}
 
-	if (first_block && waitqueue_active(&rqw->wait) &&
-	    rqw->wait.head.next != &wait->entry)
-		return false;
+static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
+{
+	struct iolatency_grp *iolat = private_data;
 	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
 }
 
@@ -281,8 +281,6 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
 {
 	struct rq_wait *rqw = &iolat->rq_wait;
 	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
-	DEFINE_WAIT(wait);
-	bool first_block = true;
 
 	if (use_delay)
 		blkcg_schedule_throttle(rqos->q, use_memdelay);
@@ -299,20 +297,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
 		return;
 	}
 
-	if (iolatency_may_queue(iolat, &wait, first_block))
-		return;
-
-	do {
-		prepare_to_wait_exclusive(&rqw->wait, &wait,
-					  TASK_UNINTERRUPTIBLE);
-
-		if (iolatency_may_queue(iolat, &wait, first_block))
-			break;
-		first_block = false;
-		io_schedule();
-	} while (1);
-
-	finish_wait(&rqw->wait, &wait);
+	rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
 }
 
 #define SCALE_DOWN_FACTOR 2

From 4705de735b3383792c84a92e57508d6865caa85f Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Thu, 6 Dec 2018 12:49:38 -0500
Subject: [PATCH 214/351] blkcg: put back rcu lock in blkcg_bio_issue_check()

I was a little overzealous in removing the rcu_read_lock() call from
blkcg_bio_issue_check() and it broke blk-throttle. Put it back.

Fixes: e35403a034bf ("blkcg: associate blkg when associating a device")
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/blk-cgroup.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 752de1becb5c..bf13ecb0fe4f 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -764,6 +764,8 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	struct blkcg_gq *blkg;
 	bool throtl = false;
 
+	rcu_read_lock();
+
 	if (!bio->bi_blkg) {
 		char b[BDEVNAME_SIZE];
 
@@ -791,6 +793,7 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 
 	blkcg_bio_issue_init(bio);
 
+	rcu_read_unlock();
 	return !throtl;
 }
 

From 5938870247be4453ef6602c7ce467bebb48113c8 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Fri, 7 Dec 2018 11:03:53 +0800
Subject: [PATCH 215/351] blk-mq: re-build queue map in case of kdump kernel

Now almost all .map_queues() implementation based on managed irq
affinity doesn't update queue mapping and it just retrieves the
old built mapping, so if nr_hw_queues is changed, the mapping talbe
includes stale mapping. And only blk_mq_map_queues() may rebuild
the mapping talbe.

One case is that we limit .nr_hw_queues as 1 in case of kdump kernel.
However, drivers often builds queue mapping before allocating tagset
via pci_alloc_irq_vectors_affinity(), but set->nr_hw_queues can be set
as 1 in case of kdump kernel, so wrong queue mapping is used, and
kernel panic[1] is observed during booting.

This patch fixes the kernel panic triggerd on nvme by rebulding the
mapping table via blk_mq_map_queues().

[1] kernel panic log
[    4.438371] nvme nvme0: 16/0/0 default/read/poll queues
[    4.443277] BUG: unable to handle kernel NULL pointer dereference at 0000000000000098
[    4.444681] PGD 0 P4D 0
[    4.445367] Oops: 0000 [#1] SMP NOPTI
[    4.446342] CPU: 3 PID: 201 Comm: kworker/u33:10 Not tainted 4.20.0-rc5-00664-g5eb02f7ee1eb-dirty #459
[    4.447630] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-2.fc27 04/01/2014
[    4.448689] Workqueue: nvme-wq nvme_scan_work [nvme_core]
[    4.449368] RIP: 0010:blk_mq_map_swqueue+0xfb/0x222
[    4.450596] Code: 04 f5 20 28 ef 81 48 89 c6 39 55 30 76 93 89 d0 48 c1 e0 04 48 03 83 f8 05 00 00 48 8b 00 42 8b 3c 28 48 8b 43 58 48 8b 04 f8 <48> 8b b8 98 00 00 00 4c 0f a3 37 72 42 f0 4c 0f ab 37 66 8b b8 f6
[    4.453132] RSP: 0018:ffffc900023b3cd8 EFLAGS: 00010286
[    4.454061] RAX: 0000000000000000 RBX: ffff888174448000 RCX: 0000000000000001
[    4.456480] RDX: 0000000000000001 RSI: ffffe8feffc506c0 RDI: 0000000000000001
[    4.458750] RBP: ffff88810722d008 R08: ffff88817647a880 R09: 0000000000000002
[    4.464580] R10: ffffc900023b3c10 R11: 0000000000000004 R12: ffff888174448538
[    4.467803] R13: 0000000000000004 R14: 0000000000000001 R15: 0000000000000001
[    4.469220] FS:  0000000000000000(0000) GS:ffff88817bac0000(0000) knlGS:0000000000000000
[    4.471554] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    4.472464] CR2: 0000000000000098 CR3: 0000000174e4e001 CR4: 0000000000760ee0
[    4.474264] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    4.476007] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    4.477061] PKRU: 55555554
[    4.477464] Call Trace:
[    4.478731]  blk_mq_init_allocated_queue+0x36a/0x3ad
[    4.479595]  blk_mq_init_queue+0x32/0x4e
[    4.480178]  nvme_validate_ns+0x98/0x623 [nvme_core]
[    4.480963]  ? nvme_submit_sync_cmd+0x1b/0x20 [nvme_core]
[    4.481685]  ? nvme_identify_ctrl.isra.8+0x70/0xa0 [nvme_core]
[    4.482601]  nvme_scan_work+0x23a/0x29b [nvme_core]
[    4.483269]  ? _raw_spin_unlock_irqrestore+0x25/0x38
[    4.483930]  ? try_to_wake_up+0x38d/0x3b3
[    4.484478]  ? process_one_work+0x179/0x2fc
[    4.485118]  process_one_work+0x1d3/0x2fc
[    4.485655]  ? rescuer_thread+0x2ae/0x2ae
[    4.486196]  worker_thread+0x1e9/0x2be
[    4.486841]  kthread+0x115/0x11d
[    4.487294]  ? kthread_park+0x76/0x76
[    4.487784]  ret_from_fork+0x3a/0x50
[    4.488322] Modules linked in: nvme nvme_core qemu_fw_cfg virtio_scsi ip_tables
[    4.489428] Dumping ftrace buffer:
[    4.489939]    (ftrace buffer empty)
[    4.490492] CR2: 0000000000000098
[    4.491052] ---[ end trace 03cd268ad5a86ff7 ]---

Cc: Christoph Hellwig <hch@lst.de>
Cc: linux-nvme@lists.infradead.org
Cc: David Milburn <dmilburn@redhat.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index eabc7fcd96db..7f478ae288af 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2936,7 +2936,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 
 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
 {
-	if (set->ops->map_queues) {
+	if (set->ops->map_queues && !is_kdump_kernel()) {
 		int i;
 
 		/*
@@ -3006,6 +3006,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	 */
 	if (is_kdump_kernel()) {
 		set->nr_hw_queues = 1;
+		set->nr_maps = 1;
 		set->queue_depth = min(64U, set->queue_depth);
 	}
 	/*
@@ -3027,7 +3028,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 						  GFP_KERNEL, set->numa_node);
 		if (!set->map[i].mq_map)
 			goto out_free_mq_map;
-		set->map[i].nr_queues = set->nr_hw_queues;
+		set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
 	}
 
 	ret = blk_mq_update_queue_map(set);

From 1190203555ec58d768ea802e8fdbb5c42187d9d5 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Mon, 29 Oct 2018 16:44:18 -0700
Subject: [PATCH 216/351] nvme: consolidate memset calls in the nvme_setup_cmd
 path

In function nvme_setup_cmd() we call command specific setup function
for flush, rw, and discard. Instead of calling memset in each function
lets call it once in the parent function.

This is purely code cleanup patch and it does not change any existing
functionality.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3ce7fc9df378..5c4f9402f4cd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -536,7 +536,6 @@ static void nvme_assign_write_stream(struct nvme_ctrl *ctrl,
 static inline void nvme_setup_flush(struct nvme_ns *ns,
 		struct nvme_command *cmnd)
 {
-	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->common.opcode = nvme_cmd_flush;
 	cmnd->common.nsid = cpu_to_le32(ns->head->ns_id);
 }
@@ -569,7 +568,6 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 		return BLK_STS_IOERR;
 	}
 
-	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->dsm.opcode = nvme_cmd_dsm;
 	cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->dsm.nr = cpu_to_le32(segments - 1);
@@ -598,7 +596,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
 
-	memset(cmnd, 0, sizeof(*cmnd));
 	cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
 	cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id);
 	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
@@ -663,6 +660,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 
 	nvme_clear_nvme_request(req);
 
+	memset(cmd, 0, sizeof(*cmd));
 	switch (req_op(req)) {
 	case REQ_OP_DRV_IN:
 	case REQ_OP_DRV_OUT:

From 103e515efa89be33d04e45aae82de136f0c49865 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.com>
Date: Fri, 16 Nov 2018 09:22:29 +0100
Subject: [PATCH 217/351] nvme: add a numa_node field to struct nvme_ctrl

Instead of directly poking into the struct device add a new numa_node
field to struct nvme_ctrl.  This allows fabrics drivers where ctrl->dev
is a virtual device to support NUMA affinity as well.

Also expose the field as a sysfs attribute, and populate it for the
RDMA and FC transports.

Signed-off-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c      | 4 +++-
 drivers/nvme/host/fc.c        | 5 +++--
 drivers/nvme/host/multipath.c | 4 ++--
 drivers/nvme/host/nvme.h      | 1 +
 drivers/nvme/host/rdma.c      | 5 +++--
 5 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 5c4f9402f4cd..e57c673ae737 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2766,6 +2766,7 @@ static ssize_t  field##_show(struct device *dev,				\
 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
 
 nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
 
 static ssize_t nvme_sysfs_delete(struct device *dev,
 				struct device_attribute *attr, const char *buf,
@@ -2845,6 +2846,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_subsysnqn.attr,
 	&dev_attr_address.attr,
 	&dev_attr_state.attr,
+	&dev_attr_numa_node.attr,
 	NULL
 };
 
@@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	struct gendisk *disk;
 	struct nvme_id_ns *id;
 	char disk_name[DISK_NAME_LEN];
-	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+	int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index e6994e4ce17e..b79e41938513 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2425,7 +2425,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
 	ctrl->tag_set.ops = &nvme_fc_mq_ops;
 	ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
 	ctrl->tag_set.reserved_tags = 1; /* fabric connect */
-	ctrl->tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
 	ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	ctrl->tag_set.cmd_size =
 		struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
@@ -3018,6 +3018,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	ctrl->ctrl.opts = opts;
 	ctrl->ctrl.nr_reconnects = 0;
+	ctrl->ctrl.numa_node = dev_to_node(lport->dev);
 	INIT_LIST_HEAD(&ctrl->ctrl_list);
 	ctrl->lport = lport;
 	ctrl->rport = rport;
@@ -3058,7 +3059,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
 	ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 	ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
-	ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
 	ctrl->admin_tag_set.cmd_size =
 		struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
 			    ctrl->lport->ops->fcprqst_priv_sz);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index ec310b1b9267..183ec17ba067 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
 
-		distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+		distance = node_distance(node, ns->ctrl->numa_node);
 
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
@@ -261,7 +261,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
 		return 0;
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
+	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node);
 	if (!q)
 		goto out;
 	q->queuedata = head;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ae77eb16fd1f..f1e4566e6ca8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -153,6 +153,7 @@ struct nvme_ctrl {
 	struct request_queue *connect_q;
 	struct device *dev;
 	int instance;
+	int numa_node;
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 75c01d20a133..f2db848f6985 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -694,7 +694,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->ops = &nvme_rdma_admin_mq_ops;
 		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 		set->reserved_tags = 2; /* connect + keep-alive */
-		set->numa_node = NUMA_NO_NODE;
+		set->numa_node = nctrl->numa_node;
 		set->cmd_size = sizeof(struct nvme_rdma_request) +
 			SG_CHUNK_SIZE * sizeof(struct scatterlist);
 		set->driver_data = ctrl;
@@ -707,7 +707,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->ops = &nvme_rdma_mq_ops;
 		set->queue_depth = nctrl->sqsize + 1;
 		set->reserved_tags = 1; /* fabric connect */
-		set->numa_node = NUMA_NO_NODE;
+		set->numa_node = nctrl->numa_node;
 		set->flags = BLK_MQ_F_SHOULD_MERGE;
 		set->cmd_size = sizeof(struct nvme_rdma_request) +
 			SG_CHUNK_SIZE * sizeof(struct scatterlist);
@@ -763,6 +763,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		return error;
 
 	ctrl->device = ctrl->queues[0].device;
+	ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
 
 	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 

From 12b2117161ddbdcdb69777404c5aa2a9fe6ad7d5 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 2 Nov 2018 10:28:12 -0700
Subject: [PATCH 218/351] nvme: introduce ctrl attributes enumeration

We are growing more controller attributes, so use a proper enumeration
for it.  For now just add the 128-bit hostid which we support.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/admin-cmd.c | 2 +-
 include/linux/nvme.h            | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 1179f6314323..30778ffc46f5 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -304,7 +304,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	/* XXX: figure out what to do about RTD3R/RTD3 */
 	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
-	id->ctratt = cpu_to_le32(1 << 0);
+	id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT);
 
 	id->oacs = 0;
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 818dbe9331be..753c83a5c01f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -198,6 +198,10 @@ enum {
 	NVME_PS_FLAGS_NON_OP_STATE	= 1 << 1,
 };
 
+enum nvme_ctrl_attr {
+	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
+};
+
 struct nvme_id_ctrl {
 	__le16			vid;
 	__le16			ssvid;

From 3e53ba38a9404434a8b57683825279f4305b5a76 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 2 Nov 2018 10:28:14 -0700
Subject: [PATCH 219/351] nvme: cache controller attributes

We get the controller attributes in identify, cache them as we'll need
them for traffic based keep alive support.

Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 1 +
 drivers/nvme/host/nvme.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e57c673ae737..9de6244a345c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2409,6 +2409,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	ctrl->sgls = le32_to_cpu(id->sgls);
 	ctrl->kas = le16_to_cpu(id->kas);
 	ctrl->max_namespaces = le32_to_cpu(id->mnan);
+	ctrl->ctratt = le32_to_cpu(id->ctratt);
 
 	if (id->rtd3e) {
 		/* us -> s */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f1e4566e6ca8..4be7bbcfe66d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -194,6 +194,7 @@ struct nvme_ctrl {
 	u8 apsta;
 	u32 oaes;
 	u32 aen_result;
+	u32 ctratt;
 	unsigned int shutdown_timeout;
 	unsigned int kato;
 	bool subsystem;

From 6e3ca03ee934572d5de4fb2224c01e12c4d422c8 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 2 Nov 2018 10:28:15 -0700
Subject: [PATCH 220/351] nvme: support traffic based keep-alive

If the controller supports traffic based keep alive, we restart the keep
alive timer if any admin or io commands was completed during the kato
period.  This prevents a possible starvation of keep alive commands in
the presence of heavy traffic as in such case, we already have a health
indication from the host perspective.

Only set a comp_seen indicator in case the controller supports keep
alive to minimize the overhead for pci controllers.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 13 +++++++++++++
 drivers/nvme/host/nvme.h |  1 +
 include/linux/nvme.h     |  1 +
 3 files changed, 15 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9de6244a345c..48ffb1d685c2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -251,6 +251,9 @@ void nvme_complete_rq(struct request *req)
 
 	trace_nvme_complete_rq(req);
 
+	if (nvme_req(req)->ctrl->kas)
+		nvme_req(req)->ctrl->comp_seen = true;
+
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
 		if ((req->cmd_flags & REQ_NVME_MPATH) &&
 		    blk_path_error(status)) {
@@ -839,6 +842,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
 		return;
 	}
 
+	ctrl->comp_seen = false;
 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
 
@@ -863,6 +867,15 @@ static void nvme_keep_alive_work(struct work_struct *work)
 {
 	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
 			struct nvme_ctrl, ka_work);
+	bool comp_seen = ctrl->comp_seen;
+
+	if ((ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) && comp_seen) {
+		dev_dbg(ctrl->device,
+			"reschedule traffic based keep-alive timer\n");
+		ctrl->comp_seen = false;
+		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+		return;
+	}
 
 	if (nvme_keep_alive(ctrl)) {
 		/* allocation failure, reset the controller */
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 4be7bbcfe66d..f2594d468f29 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -145,6 +145,7 @@ enum nvme_ctrl_state {
 };
 
 struct nvme_ctrl {
+	bool comp_seen;
 	enum nvme_ctrl_state state;
 	bool identified;
 	spinlock_t lock;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 753c83a5c01f..429c4cf90899 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -200,6 +200,7 @@ enum {
 
 enum nvme_ctrl_attr {
 	NVME_CTRL_ATTR_HID_128_BIT	= (1 << 0),
+	NVME_CTRL_ATTR_TBKAS		= (1 << 6),
 };
 
 struct nvme_id_ctrl {

From c09305ae49970e15cd18828c0f78b766e8cf224f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Fri, 2 Nov 2018 10:28:13 -0700
Subject: [PATCH 221/351] nvmet: support for traffic based keep-alive

A controller that supports traffic based keep-alive can restart the keep
alive timer even when no keep-alive was not received in the kato period
as long as other admin or I/O commands were received.  For each command
set ctrl->cmd_seen to true, and when keep-alive timer expires, if any
commands were seen, resched ka_work instead of escalating to a fatal
error.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/admin-cmd.c |  3 ++-
 drivers/nvme/target/core.c      | 12 ++++++++++++
 drivers/nvme/target/nvmet.h     |  2 ++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 30778ffc46f5..c9c6d25a3ec2 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -304,7 +304,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	/* XXX: figure out what to do about RTD3R/RTD3 */
 	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
-	id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT);
+	id->ctratt = cpu_to_le32(NVME_CTRL_ATTR_HID_128_BIT |
+		NVME_CTRL_ATTR_TBKAS);
 
 	id->oacs = 0;
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index a5f9bbce863f..05d8d9bc058f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -299,6 +299,15 @@ static void nvmet_keep_alive_timer(struct work_struct *work)
 {
 	struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work),
 			struct nvmet_ctrl, ka_work);
+	bool cmd_seen = ctrl->cmd_seen;
+
+	ctrl->cmd_seen = false;
+	if (cmd_seen) {
+		pr_debug("ctrl %d reschedule traffic based keep-alive timer\n",
+			ctrl->cntlid);
+		schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+		return;
+	}
 
 	pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n",
 		ctrl->cntlid, ctrl->kato);
@@ -801,6 +810,9 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 		goto fail;
 	}
 
+	if (sq->ctrl)
+		sq->ctrl->cmd_seen = true;
+
 	return true;
 
 fail:
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c2b4d9ee6391..860218edeb6c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -163,6 +163,8 @@ struct nvmet_ctrl {
 	struct nvmet_cq		**cqs;
 	struct nvmet_sq		**sqs;
 
+	bool			cmd_seen;
+
 	struct mutex		lock;
 	u64			cap;
 	u32			cc;

From 50a909db36f244bcfec6e02598d31c0b0a468175 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 14 Nov 2018 01:12:19 -0500
Subject: [PATCH 222/351] nvmet: use IOCB_NOWAIT for file-ns buffered I/O

This patch optimizes read command behavior when file-ns configured
with buffered I/O. Instead of offloading the buffered I/O read operations
to the worker threads, we first issue the read operation with IOCB_NOWAIT
and try and access the data from the cache. Here we only offload the
request to the worker thread and complete the request in the worker
thread context when IOCB_NOWAIT request fails.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/io-cmd-file.c | 130 +++++++++++++++++++-----------
 1 file changed, 84 insertions(+), 46 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 01feebec29ea..12eaa8ddc248 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -83,17 +83,16 @@ static void nvmet_file_init_bvec(struct bio_vec *bv, struct sg_page_iter *iter)
 }
 
 static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
-		unsigned long nr_segs, size_t count)
+		unsigned long nr_segs, size_t count, int ki_flags)
 {
 	struct kiocb *iocb = &req->f.iocb;
 	ssize_t (*call_iter)(struct kiocb *iocb, struct iov_iter *iter);
 	struct iov_iter iter;
-	int ki_flags = 0, rw;
-	ssize_t ret;
+	int rw;
 
 	if (req->cmd->rw.opcode == nvme_cmd_write) {
 		if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
-			ki_flags = IOCB_DSYNC;
+			ki_flags |= IOCB_DSYNC;
 		call_iter = req->ns->file->f_op->write_iter;
 		rw = WRITE;
 	} else {
@@ -107,12 +106,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
 	iocb->ki_filp = req->ns->file;
 	iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
 
-	ret = call_iter(iocb, &iter);
-
-	if (ret != -EIOCBQUEUED && iocb->ki_complete)
-		iocb->ki_complete(iocb, ret, 0);
-
-	return ret;
+	return call_iter(iocb, &iter);
 }
 
 static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
@@ -130,7 +124,7 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
 			NVME_SC_INTERNAL | NVME_SC_DNR : 0);
 }
 
-static void nvmet_file_execute_rw(struct nvmet_req *req)
+static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
 {
 	ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
 	struct sg_page_iter sg_pg_iter;
@@ -140,30 +134,14 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 	ssize_t ret = 0;
 	loff_t pos;
 
-	if (!req->sg_cnt || !nr_bvec) {
-		nvmet_req_complete(req, 0);
-		return;
-	}
+
+	if (req->f.mpool_alloc && nr_bvec > NVMET_MAX_MPOOL_BVEC)
+		is_sync = true;
 
 	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
 	if (unlikely(pos + req->data_len > req->ns->size)) {
 		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
-		return;
-	}
-
-	if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
-		req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
-				GFP_KERNEL);
-	else
-		req->f.bvec = req->inline_bvec;
-
-	req->f.mpool_alloc = false;
-	if (unlikely(!req->f.bvec)) {
-		/* fallback under memory pressure */
-		req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
-		req->f.mpool_alloc = true;
-		if (nr_bvec > NVMET_MAX_MPOOL_BVEC)
-			is_sync = true;
+		return true;
 	}
 
 	memset(&req->f.iocb, 0, sizeof(struct kiocb));
@@ -177,9 +155,10 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 
 		if (unlikely(is_sync) &&
 		    (nr_bvec - 1 == 0 || bv_cnt == NVMET_MAX_MPOOL_BVEC)) {
-			ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len);
+			ret = nvmet_file_submit_bvec(req, pos, bv_cnt, len, 0);
 			if (ret < 0)
-				goto out;
+				goto complete;
+
 			pos += len;
 			bv_cnt = 0;
 			len = 0;
@@ -187,30 +166,92 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 		nr_bvec--;
 	}
 
-	if (WARN_ON_ONCE(total_len != req->data_len))
+	if (WARN_ON_ONCE(total_len != req->data_len)) {
 		ret = -EIO;
-out:
-	if (unlikely(is_sync || ret)) {
-		nvmet_file_io_done(&req->f.iocb, ret < 0 ? ret : total_len, 0);
-		return;
+		goto complete;
 	}
-	req->f.iocb.ki_complete = nvmet_file_io_done;
-	nvmet_file_submit_bvec(req, pos, bv_cnt, total_len);
+
+	if (unlikely(is_sync)) {
+		ret = total_len;
+		goto complete;
+	}
+
+	/*
+	 * A NULL ki_complete ask for synchronous execution, which we want
+	 * for the IOCB_NOWAIT case.
+	 */
+	if (!(ki_flags & IOCB_NOWAIT))
+		req->f.iocb.ki_complete = nvmet_file_io_done;
+
+	ret = nvmet_file_submit_bvec(req, pos, bv_cnt, total_len, ki_flags);
+
+	switch (ret) {
+	case -EIOCBQUEUED:
+		return true;
+	case -EAGAIN:
+		if (WARN_ON_ONCE(!(ki_flags & IOCB_NOWAIT)))
+			goto complete;
+		return false;
+	case -EOPNOTSUPP:
+		/*
+		 * For file systems returning error -EOPNOTSUPP, handle
+		 * IOCB_NOWAIT error case separately and retry without
+		 * IOCB_NOWAIT.
+		 */
+		if ((ki_flags & IOCB_NOWAIT))
+			return false;
+		break;
+	}
+
+complete:
+	nvmet_file_io_done(&req->f.iocb, ret, 0);
+	return true;
 }
 
 static void nvmet_file_buffered_io_work(struct work_struct *w)
 {
 	struct nvmet_req *req = container_of(w, struct nvmet_req, f.work);
 
-	nvmet_file_execute_rw(req);
+	nvmet_file_execute_io(req, 0);
 }
 
-static void nvmet_file_execute_rw_buffered_io(struct nvmet_req *req)
+static void nvmet_file_submit_buffered_io(struct nvmet_req *req)
 {
 	INIT_WORK(&req->f.work, nvmet_file_buffered_io_work);
 	queue_work(buffered_io_wq, &req->f.work);
 }
 
+static void nvmet_file_execute_rw(struct nvmet_req *req)
+{
+	ssize_t nr_bvec = DIV_ROUND_UP(req->data_len, PAGE_SIZE);
+
+	if (!req->sg_cnt || !nr_bvec) {
+		nvmet_req_complete(req, 0);
+		return;
+	}
+
+	if (nr_bvec > NVMET_MAX_INLINE_BIOVEC)
+		req->f.bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
+				GFP_KERNEL);
+	else
+		req->f.bvec = req->inline_bvec;
+
+	if (unlikely(!req->f.bvec)) {
+		/* fallback under memory pressure */
+		req->f.bvec = mempool_alloc(req->ns->bvec_pool, GFP_KERNEL);
+		req->f.mpool_alloc = true;
+	} else
+		req->f.mpool_alloc = false;
+
+	if (req->ns->buffered_io) {
+		if (likely(!req->f.mpool_alloc) &&
+				nvmet_file_execute_io(req, IOCB_NOWAIT))
+			return;
+		nvmet_file_submit_buffered_io(req);
+	} else
+		nvmet_file_execute_io(req, 0);
+}
+
 u16 nvmet_file_flush(struct nvmet_req *req)
 {
 	if (vfs_fsync(req->ns->file, 1) < 0)
@@ -320,10 +361,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
 	switch (cmd->common.opcode) {
 	case nvme_cmd_read:
 	case nvme_cmd_write:
-		if (req->ns->buffered_io)
-			req->execute = nvmet_file_execute_rw_buffered_io;
-		else
-			req->execute = nvmet_file_execute_rw;
+		req->execute = nvmet_file_execute_rw;
 		req->data_len = nvmet_rw_len(req);
 		return 0;
 	case nvme_cmd_flush:

From 6c8312ad509c3aaaa8720fee7abe45b813cd4d87 Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:33 -0800
Subject: [PATCH 223/351] nvmet: provide aen bit functions for multiple
 controller types

Move nvmet_aen_disabled and nvmet_clear_aen in preparation for other types
of controllers to use, initially the discovery controller.

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/admin-cmd.c | 13 -------------
 drivers/nvme/target/core.c      |  7 -------
 drivers/nvme/target/nvmet.h     | 15 +++++++++++++++
 3 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c9c6d25a3ec2..e82262c988f1 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -19,19 +19,6 @@
 #include <asm/unaligned.h>
 #include "nvmet.h"
 
-/*
- * This helper allows us to clear the AEN based on the RAE bit,
- * Please use this helper when processing the log pages which are
- * associated with the AEN.
- */
-static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
-{
-	int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
-
-	if (!rae)
-		clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
-}
-
 u32 nvmet_get_log_page_len(struct nvme_command *cmd)
 {
 	u32 len = le16_to_cpu(cmd->get_log_page.numdu);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 05d8d9bc058f..f33c4a20b572 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -150,13 +150,6 @@ static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 	schedule_work(&ctrl->async_event_work);
 }
 
-static bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
-{
-	if (!(READ_ONCE(ctrl->aen_enabled) & aen))
-		return true;
-	return test_and_set_bit(aen, &ctrl->aen_masked);
-}
-
 static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
 {
 	u32 i;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 860218edeb6c..7efee345d467 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -342,6 +342,21 @@ struct nvmet_async_event {
 	u8			log_page;
 };
 
+static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
+{
+	int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
+
+	if (!rae)
+		clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
+}
+
+static inline bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
+{
+	if (!(READ_ONCE(ctrl->aen_enabled) & aen))
+		return true;
+	return test_and_set_bit(aen, &ctrl->aen_masked);
+}
+
 u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
 u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);

From 7114ddeb40c0ccc584d86df598da4054ca4cd79f Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:34 -0800
Subject: [PATCH 224/351] nvmet: change aen mask functions to use bit numbers

Functions nvmet_aen_disabled and nvmet_clear_aen were using
values not bit numbers ie 1 << 9 not 9 for bit function clear_bit
and test_and_set_bit.

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Phil Cayton <phil.cayton@intel.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/admin-cmd.c |  4 ++--
 drivers/nvme/target/core.c      |  4 ++--
 drivers/nvme/target/nvmet.h     | 10 +++++-----
 include/linux/nvme.h            | 12 +++++++++---
 4 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index e82262c988f1..2e89f4e3364b 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -176,7 +176,7 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
 	if (!status)
 		status = nvmet_zero_sgl(req, len, req->data_len - len);
 	ctrl->nr_changed_ns = 0;
-	nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR);
+	nvmet_clear_aen_bit(req, NVME_AEN_BIT_NS_ATTR);
 	mutex_unlock(&ctrl->lock);
 out:
 	nvmet_req_complete(req, status);
@@ -239,7 +239,7 @@ static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
 
 	hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
 	hdr.ngrps = cpu_to_le16(ngrps);
-	nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE);
+	nvmet_clear_aen_bit(req, NVME_AEN_BIT_ANA_CHANGE);
 	up_read(&nvmet_ana_sem);
 
 	kfree(desc);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index f33c4a20b572..f42a105ef17f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -180,7 +180,7 @@ void nvmet_ns_changed(struct nvmet_subsys *subsys, u32 nsid)
 
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 		nvmet_add_to_changed_ns_log(ctrl, cpu_to_le32(nsid));
-		if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_NS_ATTR))
+		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_NS_ATTR))
 			continue;
 		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 				NVME_AER_NOTICE_NS_CHANGED,
@@ -197,7 +197,7 @@ void nvmet_send_ana_event(struct nvmet_subsys *subsys,
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 		if (port && ctrl->port != port)
 			continue;
-		if (nvmet_aen_disabled(ctrl, NVME_AEN_CFG_ANA_CHANGE))
+		if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_ANA_CHANGE))
 			continue;
 		nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
 				NVME_AER_NOTICE_ANA, NVME_LOG_ANA);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 7efee345d467..8ddc54fa98c7 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -342,19 +342,19 @@ struct nvmet_async_event {
 	u8			log_page;
 };
 
-static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
+static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn)
 {
 	int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
 
 	if (!rae)
-		clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
+		clear_bit(bn, &req->sq->ctrl->aen_masked);
 }
 
-static inline bool nvmet_aen_disabled(struct nvmet_ctrl *ctrl, u32 aen)
+static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn)
 {
-	if (!(READ_ONCE(ctrl->aen_enabled) & aen))
+	if (!(READ_ONCE(ctrl->aen_enabled) & (1 << bn)))
 		return true;
-	return test_and_set_bit(aen, &ctrl->aen_masked);
+	return test_and_set_bit(bn, &ctrl->aen_masked);
 }
 
 u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 429c4cf90899..d6cfa194be80 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -489,9 +489,15 @@ enum {
 };
 
 enum {
-	NVME_AEN_CFG_NS_ATTR		= 1 << 8,
-	NVME_AEN_CFG_FW_ACT		= 1 << 9,
-	NVME_AEN_CFG_ANA_CHANGE		= 1 << 11,
+	NVME_AEN_BIT_NS_ATTR		= 8,
+	NVME_AEN_BIT_FW_ACT		= 9,
+	NVME_AEN_BIT_ANA_CHANGE		= 11,
+};
+
+enum {
+	NVME_AEN_CFG_NS_ATTR		= 1 << NVME_AEN_BIT_NS_ATTR,
+	NVME_AEN_CFG_FW_ACT		= 1 << NVME_AEN_BIT_FW_ACT,
+	NVME_AEN_CFG_ANA_CHANGE		= 1 << NVME_AEN_BIT_ANA_CHANGE,
 };
 
 struct nvme_lba_range_type {

From f9362ac1738a41cc526fde82e76beb034d8c6053 Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:35 -0800
Subject: [PATCH 225/351] nvmet: allow Keep Alive for Discovery controller

Per change to specification allowing Discovery controllers to have
explicit persistent connections, remove restriction on Discovery
controllers allowing kato on connect.

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/admin-cmd.c |  2 +-
 drivers/nvme/target/core.c      | 36 +++++++++------------------------
 drivers/nvme/target/discovery.c |  4 ++++
 drivers/nvme/target/nvmet.h     |  4 +++-
 4 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 2e89f4e3364b..ad94119f6a51 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -726,7 +726,7 @@ static void nvmet_execute_async_event(struct nvmet_req *req)
 	schedule_work(&ctrl->async_event_work);
 }
 
-static void nvmet_execute_keep_alive(struct nvmet_req *req)
+void nvmet_execute_keep_alive(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index f42a105ef17f..46345c3f9d4a 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1180,31 +1180,17 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	ctrl->cntlid = ret;
 
 	ctrl->ops = req->ops;
-	if (ctrl->subsys->type == NVME_NQN_DISC) {
-		/* Don't accept keep-alive timeout for discovery controllers */
-		if (kato) {
-			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
-			goto out_remove_ida;
-		}
 
-		/*
-		 * Discovery controllers use some arbitrary high value in order
-		 * to cleanup stale discovery sessions
-		 *
-		 * From the latest base diff RC:
-		 * "The Keep Alive command is not supported by
-		 * Discovery controllers. A transport may specify a
-		 * fixed Discovery controller activity timeout value
-		 * (e.g., 2 minutes).  If no commands are received
-		 * by a Discovery controller within that time
-		 * period, the controller may perform the
-		 * actions for Keep Alive Timer expiration".
-		 */
-		ctrl->kato = NVMET_DISC_KATO;
-	} else {
-		/* keep-alive timeout in seconds */
-		ctrl->kato = DIV_ROUND_UP(kato, 1000);
-	}
+	/*
+	 * Discovery controllers may use some arbitrary high value
+	 * in order to cleanup stale discovery sessions
+	 */
+	if ((ctrl->subsys->type == NVME_NQN_DISC) && !kato)
+		kato = NVMET_DISC_KATO_MS;
+
+	/* keep-alive timeout in seconds */
+	ctrl->kato = DIV_ROUND_UP(kato, 1000);
+
 	nvmet_start_keep_alive_timer(ctrl);
 
 	mutex_lock(&subsys->lock);
@@ -1215,8 +1201,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	*ctrlp = ctrl;
 	return 0;
 
-out_remove_ida:
-	ida_simple_remove(&cntlid_ida, ctrl->cntlid);
 out_free_sqs:
 	kfree(ctrl->sqs);
 out_free_cqs:
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index bc0aa0bf1543..6e0f91eb34ce 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -194,6 +194,10 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 	}
 
 	switch (cmd->common.opcode) {
+	case nvme_admin_keep_alive:
+		req->execute = nvmet_execute_keep_alive;
+		req->data_len = 0;
+		return 0;
 	case nvme_admin_get_log_page:
 		req->data_len = nvmet_get_log_page_len(cmd);
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8ddc54fa98c7..c525fb0510c5 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -372,6 +372,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
 int nvmet_req_alloc_sgl(struct nvmet_req *req);
 void nvmet_req_free_sgl(struct nvmet_req *req);
 
+void nvmet_execute_keep_alive(struct nvmet_req *req);
+
 void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
 		u16 size);
 void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid,
@@ -442,7 +444,7 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd);
 #define NVMET_DEFAULT_ANA_GRPID	1
 
 #define NVMET_KAS		10
-#define NVMET_DISC_KATO		120
+#define NVMET_DISC_KATO_MS		120000
 
 int __init nvmet_init_configfs(void);
 void __exit nvmet_exit_configfs(void);

From 90107455cce753c05a5e1e80cb84b09da1c87eef Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:36 -0800
Subject: [PATCH 226/351] nvmet: make kato and AEN processing for use by other
 controllers

Make common process of get/set features available to other controllers by
making simple functions static inline and others not static and prototypes
in nvmet.h file

Also remove static from nvmet_execute_async_event and add prototype to
nvmet.h to allow used by other controllers

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/admin-cmd.c | 54 ++++++++++++++++++++++++---------
 drivers/nvme/target/nvmet.h     |  6 ++++
 2 files changed, 45 insertions(+), 15 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index ad94119f6a51..753515fc8028 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -587,11 +587,34 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 	return status;
 }
 
+u16 nvmet_set_feat_kato(struct nvmet_req *req)
+{
+	u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
+
+	req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
+
+	nvmet_set_result(req, req->sq->ctrl->kato);
+
+	return 0;
+}
+
+u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
+{
+	u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
+
+	if (val32 & ~mask)
+		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+
+	WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
+	nvmet_set_result(req, val32);
+
+	return 0;
+}
+
 static void nvmet_execute_set_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
 	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
-	u32 val32;
 	u16 status = 0;
 
 	switch (cdw10 & 0xff) {
@@ -600,19 +623,10 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
 			(subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16));
 		break;
 	case NVME_FEAT_KATO:
-		val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
-		req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
-		nvmet_set_result(req, req->sq->ctrl->kato);
+		status = nvmet_set_feat_kato(req);
 		break;
 	case NVME_FEAT_ASYNC_EVENT:
-		val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
-		if (val32 & ~NVMET_AEN_CFG_ALL) {
-			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
-			break;
-		}
-
-		WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
-		nvmet_set_result(req, val32);
+		status = nvmet_set_feat_async_event(req, NVMET_AEN_CFG_ALL);
 		break;
 	case NVME_FEAT_HOST_ID:
 		status = NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
@@ -648,6 +662,16 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
 	return 0;
 }
 
+void nvmet_get_feat_kato(struct nvmet_req *req)
+{
+	nvmet_set_result(req, req->sq->ctrl->kato * 1000);
+}
+
+void nvmet_get_feat_async_event(struct nvmet_req *req)
+{
+	nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
+}
+
 static void nvmet_execute_get_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
@@ -677,7 +701,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 		break;
 #endif
 	case NVME_FEAT_ASYNC_EVENT:
-		nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
+		nvmet_get_feat_async_event(req);
 		break;
 	case NVME_FEAT_VOLATILE_WC:
 		nvmet_set_result(req, 1);
@@ -687,7 +711,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 			(subsys->max_qid-1) | ((subsys->max_qid-1) << 16));
 		break;
 	case NVME_FEAT_KATO:
-		nvmet_set_result(req, req->sq->ctrl->kato * 1000);
+		nvmet_get_feat_kato(req);
 		break;
 	case NVME_FEAT_HOST_ID:
 		/* need 128-bit host identifier flag */
@@ -710,7 +734,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 	nvmet_req_complete(req, status);
 }
 
-static void nvmet_execute_async_event(struct nvmet_req *req)
+void nvmet_execute_async_event(struct nvmet_req *req)
 {
 	struct nvmet_ctrl *ctrl = req->sq->ctrl;
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c525fb0510c5..a8ee265a3806 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -357,6 +357,12 @@ static inline bool nvmet_aen_bit_disabled(struct nvmet_ctrl *ctrl, u32 bn)
 	return test_and_set_bit(bn, &ctrl->aen_masked);
 }
 
+void nvmet_get_feat_kato(struct nvmet_req *req);
+void nvmet_get_feat_async_event(struct nvmet_req *req);
+u16 nvmet_set_feat_kato(struct nvmet_req *req);
+u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask);
+void nvmet_execute_async_event(struct nvmet_req *req);
+
 u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
 u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
 u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);

From f301c2b1368905340133ff8ef4485befdd0b7e2d Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:37 -0800
Subject: [PATCH 227/351] nvmet: add defines for discovery change async events

Add AEN/AER values as defined by the specification

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/nvmet.h | 2 ++
 include/linux/nvme.h        | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index a8ee265a3806..bc99c700a583 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -36,6 +36,8 @@
  */
 #define NVMET_AEN_CFG_OPTIONAL \
 	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
+#define NVMET_DISC_AEN_CFG_OPTIONAL \
+	(NVME_AEN_CFG_DISC_CHANGE)
 
 /*
  * Plus mandatory SMART AENs (we'll never send them, but allow enabling them):
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d6cfa194be80..77d320d32ee5 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -486,18 +486,21 @@ enum {
 	NVME_AER_NOTICE_NS_CHANGED	= 0x00,
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
+	NVME_AER_NOTICE_DISC_CHANGED	= 0xf0,
 };
 
 enum {
 	NVME_AEN_BIT_NS_ATTR		= 8,
 	NVME_AEN_BIT_FW_ACT		= 9,
 	NVME_AEN_BIT_ANA_CHANGE		= 11,
+	NVME_AEN_BIT_DISC_CHANGE	= 31,
 };
 
 enum {
 	NVME_AEN_CFG_NS_ATTR		= 1 << NVME_AEN_BIT_NS_ATTR,
 	NVME_AEN_CFG_FW_ACT		= 1 << NVME_AEN_BIT_FW_ACT,
 	NVME_AEN_CFG_ANA_CHANGE		= 1 << NVME_AEN_BIT_ANA_CHANGE,
+	NVME_AEN_CFG_DISC_CHANGE	= 1 << NVME_AEN_BIT_DISC_CHANGE,
 };
 
 struct nvme_lba_range_type {

From 6a8ec0ac5ede074232137e505d4b90e56c1e4511 Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:38 -0800
Subject: [PATCH 228/351] nvmet: add support to Discovery controllers for
 commands

Add custom get/set features to commands allowed by Discovery controllers.

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/discovery.c | 53 +++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 6e0f91eb34ce..050e4d759b65 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -183,6 +183,47 @@ out:
 	nvmet_req_complete(req, status);
 }
 
+static void nvmet_execute_disc_set_features(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+	u16 stat;
+
+	switch (cdw10 & 0xff) {
+	case NVME_FEAT_KATO:
+		stat = nvmet_set_feat_kato(req);
+		break;
+	case NVME_FEAT_ASYNC_EVENT:
+		stat = nvmet_set_feat_async_event(req,
+						  NVMET_DISC_AEN_CFG_OPTIONAL);
+		break;
+	default:
+		stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		break;
+	}
+
+	nvmet_req_complete(req, stat);
+}
+
+static void nvmet_execute_disc_get_features(struct nvmet_req *req)
+{
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+	u16 stat = 0;
+
+	switch (cdw10 & 0xff) {
+	case NVME_FEAT_KATO:
+		nvmet_get_feat_kato(req);
+		break;
+	case NVME_FEAT_ASYNC_EVENT:
+		nvmet_get_feat_async_event(req);
+		break;
+	default:
+		stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		break;
+	}
+
+	nvmet_req_complete(req, stat);
+}
+
 u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 {
 	struct nvme_command *cmd = req->cmd;
@@ -194,6 +235,18 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 	}
 
 	switch (cmd->common.opcode) {
+	case nvme_admin_set_features:
+		req->execute = nvmet_execute_disc_set_features;
+		req->data_len = 0;
+		return 0;
+	case nvme_admin_get_features:
+		req->execute = nvmet_execute_disc_get_features;
+		req->data_len = 0;
+		return 0;
+	case nvme_admin_async_event:
+		req->execute = nvmet_execute_async_event;
+		req->data_len = 0;
+		return 0;
 	case nvme_admin_keep_alive:
 		req->execute = nvmet_execute_keep_alive;
 		req->data_len = 0;

From 253928eec61a52935584777f0dfba6cdb63967b6 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 12 Nov 2018 13:56:39 -0800
Subject: [PATCH 229/351] nvmet: allow host connect even if no allowed
 subsystems are exported

It is perfectly valid that a host connects to a discovery subsystem
and gets an empty discovery log page since no subsystems are
provisioned to it. No reason to disallow connecting to the discovery
subsystem all together.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Phil Cayton <phil.cayton@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/core.c      | 34 +++++++--------------------------
 drivers/nvme/target/discovery.c |  2 +-
 drivers/nvme/target/nvmet.h     |  3 +--
 3 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 46345c3f9d4a..849080b4115c 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1032,14 +1032,18 @@ u16 nvmet_check_ctrl_status(struct nvmet_req *req, struct nvme_command *cmd)
 	return 0;
 }
 
-static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
-		const char *hostnqn)
+bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn)
 {
 	struct nvmet_host_link *p;
 
+	lockdep_assert_held(&nvmet_config_sem);
+
 	if (subsys->allow_any_host)
 		return true;
 
+	if (subsys->type == NVME_NQN_DISC) /* allow all access to disc subsys */
+		return true;
+
 	list_for_each_entry(p, &subsys->hosts, entry) {
 		if (!strcmp(nvmet_host_name(p->host), hostnqn))
 			return true;
@@ -1048,30 +1052,6 @@ static bool __nvmet_host_allowed(struct nvmet_subsys *subsys,
 	return false;
 }
 
-static bool nvmet_host_discovery_allowed(struct nvmet_req *req,
-		const char *hostnqn)
-{
-	struct nvmet_subsys_link *s;
-
-	list_for_each_entry(s, &req->port->subsystems, entry) {
-		if (__nvmet_host_allowed(s->subsys, hostnqn))
-			return true;
-	}
-
-	return false;
-}
-
-bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
-		const char *hostnqn)
-{
-	lockdep_assert_held(&nvmet_config_sem);
-
-	if (subsys->type == NVME_NQN_DISC)
-		return nvmet_host_discovery_allowed(req, hostnqn);
-	else
-		return __nvmet_host_allowed(subsys, hostnqn);
-}
-
 /*
  * Note: ctrl->subsys->lock should be held when calling this function
  */
@@ -1122,7 +1102,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 
 	status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
 	down_read(&nvmet_config_sem);
-	if (!nvmet_host_allowed(req, subsys, hostnqn)) {
+	if (!nvmet_host_allowed(subsys, hostnqn)) {
 		pr_info("connect by host %s for subsystem %s not allowed\n",
 			hostnqn, subsysnqn);
 		req->rsp->result.u32 = IPO_IATTR_CONNECT_DATA(hostnqn);
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 050e4d759b65..5fbb1bb2c4fc 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -107,7 +107,7 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
 
 	down_read(&nvmet_config_sem);
 	list_for_each_entry(p, &req->port->subsystems, entry) {
-		if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn))
+		if (!nvmet_host_allowed(p->subsys, ctrl->hostnqn))
 			continue;
 		if (residual_len >= entry_size) {
 			char traddr[NVMF_TRADDR_SIZE];
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index bc99c700a583..05b98f25d65c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -468,8 +468,7 @@ extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
 extern u64 nvmet_ana_chgcnt;
 extern struct rw_semaphore nvmet_ana_sem;
 
-bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys,
-		const char *hostnqn);
+bool nvmet_host_allowed(struct nvmet_subsys *subsys, const char *hostnqn);
 
 int nvmet_bdev_ns_enable(struct nvmet_ns *ns);
 int nvmet_file_ns_enable(struct nvmet_ns *ns);

From b662a078576e7d6e235b4e1b94863f0474cd8555 Mon Sep 17 00:00:00 2001
From: Jay Sternberg <jay.e.sternberg@intel.com>
Date: Mon, 12 Nov 2018 13:56:40 -0800
Subject: [PATCH 230/351] nvmet: enable Discovery Controller AENs

Add functions to find connections requesting Discovery Change events
and send a notification to hosts that maintain an explicit persistent
connection and have and active Asynchronous Event Request pending.
Only Hosts that have access to the Subsystem effected by the change
will receive notifications of Discovery Change event.

Call these functions each time there is a configfs change that effects
the Discover Log Pages.

Set the OAES field in the Identify Controller response to advertise the
support for Asynchronous Event Notifications.

Signed-off-by: Jay Sternberg <jay.e.sternberg@intel.com>
Reviewed-by: Phil Cayton <phil.cayton@intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/configfs.c  | 30 +++++++++++----
 drivers/nvme/target/core.c      |  2 +-
 drivers/nvme/target/discovery.c | 68 +++++++++++++++++++++++++++++++--
 drivers/nvme/target/nvmet.h     | 11 +++++-
 4 files changed, 99 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index d895579b6c5d..d37fd7713bbc 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -25,6 +25,9 @@
 static const struct config_item_type nvmet_host_type;
 static const struct config_item_type nvmet_subsys_type;
 
+static LIST_HEAD(nvmet_ports_list);
+struct list_head *nvmet_ports = &nvmet_ports_list;
+
 static const struct nvmet_transport_name {
 	u8		type;
 	const char	*name;
@@ -646,7 +649,8 @@ static int nvmet_port_subsys_allow_link(struct config_item *parent,
 	}
 
 	list_add_tail(&link->entry, &port->subsystems);
-	nvmet_genctr++;
+	nvmet_port_disc_changed(port, subsys);
+
 	up_write(&nvmet_config_sem);
 	return 0;
 
@@ -673,7 +677,8 @@ static void nvmet_port_subsys_drop_link(struct config_item *parent,
 
 found:
 	list_del(&p->entry);
-	nvmet_genctr++;
+	nvmet_port_disc_changed(port, subsys);
+
 	if (list_empty(&port->subsystems))
 		nvmet_disable_port(port);
 	up_write(&nvmet_config_sem);
@@ -722,7 +727,8 @@ static int nvmet_allowed_hosts_allow_link(struct config_item *parent,
 			goto out_free_link;
 	}
 	list_add_tail(&link->entry, &subsys->hosts);
-	nvmet_genctr++;
+	nvmet_subsys_disc_changed(subsys, host);
+
 	up_write(&nvmet_config_sem);
 	return 0;
 out_free_link:
@@ -748,7 +754,8 @@ static void nvmet_allowed_hosts_drop_link(struct config_item *parent,
 
 found:
 	list_del(&p->entry);
-	nvmet_genctr++;
+	nvmet_subsys_disc_changed(subsys, host);
+
 	up_write(&nvmet_config_sem);
 	kfree(p);
 }
@@ -787,7 +794,11 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item,
 		goto out_unlock;
 	}
 
-	subsys->allow_any_host = allow_any_host;
+	if (subsys->allow_any_host != allow_any_host) {
+		subsys->allow_any_host = allow_any_host;
+		nvmet_subsys_disc_changed(subsys, NULL);
+	}
+
 out_unlock:
 	up_write(&nvmet_config_sem);
 	return ret ? ret : count;
@@ -936,7 +947,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item,
 	if (enable)
 		nvmet_referral_enable(parent, port);
 	else
-		nvmet_referral_disable(port);
+		nvmet_referral_disable(parent, port);
 
 	return count;
 inval:
@@ -962,9 +973,10 @@ static struct configfs_attribute *nvmet_referral_attrs[] = {
 
 static void nvmet_referral_release(struct config_item *item)
 {
+	struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent);
 	struct nvmet_port *port = to_nvmet_port(item);
 
-	nvmet_referral_disable(port);
+	nvmet_referral_disable(parent, port);
 	kfree(port);
 }
 
@@ -1137,6 +1149,8 @@ static void nvmet_port_release(struct config_item *item)
 {
 	struct nvmet_port *port = to_nvmet_port(item);
 
+	list_del(&port->global_entry);
+
 	kfree(port->ana_state);
 	kfree(port);
 }
@@ -1189,6 +1203,8 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 			port->ana_state[i] = NVME_ANA_INACCESSIBLE;
 	}
 
+	list_add(&port->global_entry, &nvmet_ports_list);
+
 	INIT_LIST_HEAD(&port->entry);
 	INIT_LIST_HEAD(&port->subsystems);
 	INIT_LIST_HEAD(&port->referrals);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 849080b4115c..5aa5a3cc5395 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -130,7 +130,7 @@ static void nvmet_async_event_work(struct work_struct *work)
 	}
 }
 
-static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
+void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 		u8 event_info, u8 log_page)
 {
 	struct nvmet_async_event *aen;
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 5fbb1bb2c4fc..eb45ebc3ac2c 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -20,24 +20,82 @@ struct nvmet_subsys *nvmet_disc_subsys;
 
 u64 nvmet_genctr;
 
+static void __nvmet_disc_changed(struct nvmet_port *port,
+				 struct nvmet_ctrl *ctrl)
+{
+	if (ctrl->port != port)
+		return;
+
+	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_DISC_CHANGE))
+		return;
+
+	nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE,
+			      NVME_AER_NOTICE_DISC_CHANGED, NVME_LOG_DISC);
+}
+
+void nvmet_port_disc_changed(struct nvmet_port *port,
+			     struct nvmet_subsys *subsys)
+{
+	struct nvmet_ctrl *ctrl;
+
+	nvmet_genctr++;
+
+	list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
+		if (subsys && !nvmet_host_allowed(subsys, ctrl->hostnqn))
+			continue;
+
+		__nvmet_disc_changed(port, ctrl);
+	}
+}
+
+static void __nvmet_subsys_disc_changed(struct nvmet_port *port,
+					struct nvmet_subsys *subsys,
+					struct nvmet_host *host)
+{
+	struct nvmet_ctrl *ctrl;
+
+	list_for_each_entry(ctrl, &nvmet_disc_subsys->ctrls, subsys_entry) {
+		if (host && strcmp(nvmet_host_name(host), ctrl->hostnqn))
+			continue;
+
+		__nvmet_disc_changed(port, ctrl);
+	}
+}
+
+void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
+			       struct nvmet_host *host)
+{
+	struct nvmet_port *port;
+	struct nvmet_subsys_link *s;
+
+	nvmet_genctr++;
+
+	list_for_each_entry(port, nvmet_ports, global_entry)
+		list_for_each_entry(s, &port->subsystems, entry) {
+			if (s->subsys != subsys)
+				continue;
+			__nvmet_subsys_disc_changed(port, subsys, host);
+		}
+}
+
 void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port)
 {
 	down_write(&nvmet_config_sem);
 	if (list_empty(&port->entry)) {
 		list_add_tail(&port->entry, &parent->referrals);
 		port->enabled = true;
-		nvmet_genctr++;
+		nvmet_port_disc_changed(parent, NULL);
 	}
 	up_write(&nvmet_config_sem);
 }
 
-void nvmet_referral_disable(struct nvmet_port *port)
+void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port)
 {
 	down_write(&nvmet_config_sem);
 	if (!list_empty(&port->entry)) {
 		port->enabled = false;
 		list_del_init(&port->entry);
-		nvmet_genctr++;
+		nvmet_port_disc_changed(parent, NULL);
 	}
 	up_write(&nvmet_config_sem);
 }
@@ -136,6 +194,8 @@ static void nvmet_execute_get_disc_log_page(struct nvmet_req *req)
 	hdr->numrec = cpu_to_le64(numrec);
 	hdr->recfmt = cpu_to_le16(0);
 
+	nvmet_clear_aen_bit(req, NVME_AEN_BIT_DISC_CHANGE);
+
 	up_read(&nvmet_config_sem);
 
 	status = nvmet_copy_to_sgl(req, 0, hdr, data_len);
@@ -174,6 +234,8 @@ static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req)
 	if (req->port->inline_data_size)
 		id->sgls |= cpu_to_le32(1 << 20);
 
+	id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL);
+
 	strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn));
 
 	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 05b98f25d65c..31474940e373 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -139,6 +139,7 @@ struct nvmet_port {
 	struct list_head		subsystems;
 	struct config_group		referrals_group;
 	struct list_head		referrals;
+	struct list_head		global_entry;
 	struct config_group		ana_groups_group;
 	struct nvmet_ana_group		ana_default_group;
 	enum nvme_ana_state		*ana_state;
@@ -422,7 +423,7 @@ int nvmet_enable_port(struct nvmet_port *port);
 void nvmet_disable_port(struct nvmet_port *port);
 
 void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port);
-void nvmet_referral_disable(struct nvmet_port *port);
+void nvmet_referral_disable(struct nvmet_port *parent, struct nvmet_port *port);
 
 u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
 		size_t len);
@@ -432,6 +433,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len);
 
 u32 nvmet_get_log_page_len(struct nvme_command *cmd);
 
+extern struct list_head *nvmet_ports;
+void nvmet_port_disc_changed(struct nvmet_port *port,
+		struct nvmet_subsys *subsys);
+void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
+		struct nvmet_host *host);
+void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
+		u8 event_info, u8 log_page);
+
 #define NVMET_QUEUE_SIZE	1024
 #define NVMET_NR_QUEUES		128
 #define NVMET_MAX_CMD		NVMET_QUEUE_SIZE

From 03198c4d9fc8e72ba8a5ad74959b61de7f2780a6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 14 Nov 2018 16:46:23 +0100
Subject: [PATCH 231/351] nvmet: mark nvmet_genctr static

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/discovery.c | 2 +-
 drivers/nvme/target/nvmet.h     | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index eb45ebc3ac2c..4d8757ae8210 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -18,7 +18,7 @@
 
 struct nvmet_subsys *nvmet_disc_subsys;
 
-u64 nvmet_genctr;
+static u64 nvmet_genctr;
 
 static void __nvmet_disc_changed(struct nvmet_port *port,
 				 struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 31474940e373..03988fe9d915 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -470,7 +470,6 @@ int __init nvmet_init_discovery(void);
 void nvmet_exit_discovery(void);
 
 extern struct nvmet_subsys *nvmet_disc_subsys;
-extern u64 nvmet_genctr;
 extern struct rw_semaphore nvmet_config_sem;
 
 extern u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];

From 6e2e312ea7ff73acfafaa5c9851e151e9483c761 Mon Sep 17 00:00:00 2001
From: James Smart <jsmart2021@gmail.com>
Date: Wed, 14 Nov 2018 15:57:46 -0800
Subject: [PATCH 232/351] nvmet-fc: remove the IN_ISR deferred scheduling
 options

All target lldd's call the cmd receive and op completions in non-isr
thread contexts. As such the IN_ISR options are not necessary.
Remove the functionality and flags, which also removes cpu assignments
to queues.

Signed-off-by: James Smart <jsmart2021@gmail.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/fc.c       | 66 ++--------------------------------
 include/linux/nvme-fc-driver.h | 16 ---------
 2 files changed, 2 insertions(+), 80 deletions(-)

diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 409081a03b24..f98f5c5bea26 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -86,8 +86,6 @@ struct nvmet_fc_fcp_iod {
 	spinlock_t			flock;
 
 	struct nvmet_req		req;
-	struct work_struct		work;
-	struct work_struct		done_work;
 	struct work_struct		defer_work;
 
 	struct nvmet_fc_tgtport		*tgtport;
@@ -134,7 +132,6 @@ struct nvmet_fc_tgt_queue {
 	u16				sqsize;
 	u16				ersp_ratio;
 	__le16				sqhd;
-	int				cpu;
 	atomic_t			connected;
 	atomic_t			sqtail;
 	atomic_t			zrspcnt;
@@ -232,8 +229,6 @@ static LIST_HEAD(nvmet_fc_portentry_list);
 
 
 static void nvmet_fc_handle_ls_rqst_work(struct work_struct *work);
-static void nvmet_fc_handle_fcp_rqst_work(struct work_struct *work);
-static void nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work);
 static void nvmet_fc_fcp_rqst_op_defer_work(struct work_struct *work);
 static void nvmet_fc_tgt_a_put(struct nvmet_fc_tgt_assoc *assoc);
 static int nvmet_fc_tgt_a_get(struct nvmet_fc_tgt_assoc *assoc);
@@ -438,8 +433,6 @@ nvmet_fc_prep_fcp_iodlist(struct nvmet_fc_tgtport *tgtport,
 	int i;
 
 	for (i = 0; i < queue->sqsize; fod++, i++) {
-		INIT_WORK(&fod->work, nvmet_fc_handle_fcp_rqst_work);
-		INIT_WORK(&fod->done_work, nvmet_fc_fcp_rqst_op_done_work);
 		INIT_WORK(&fod->defer_work, nvmet_fc_fcp_rqst_op_defer_work);
 		fod->tgtport = tgtport;
 		fod->queue = queue;
@@ -517,10 +510,7 @@ nvmet_fc_queue_fcp_req(struct nvmet_fc_tgtport *tgtport,
 	fcpreq->hwqid = queue->qid ?
 			((queue->qid - 1) % tgtport->ops->max_hw_queues) : 0;
 
-	if (tgtport->ops->target_features & NVMET_FCTGTFEAT_CMD_IN_ISR)
-		queue_work_on(queue->cpu, queue->work_q, &fod->work);
-	else
-		nvmet_fc_handle_fcp_rqst(tgtport, fod);
+	nvmet_fc_handle_fcp_rqst(tgtport, fod);
 }
 
 static void
@@ -599,30 +589,6 @@ nvmet_fc_free_fcp_iod(struct nvmet_fc_tgt_queue *queue,
 	queue_work(queue->work_q, &fod->defer_work);
 }
 
-static int
-nvmet_fc_queue_to_cpu(struct nvmet_fc_tgtport *tgtport, int qid)
-{
-	int cpu, idx, cnt;
-
-	if (tgtport->ops->max_hw_queues == 1)
-		return WORK_CPU_UNBOUND;
-
-	/* Simple cpu selection based on qid modulo active cpu count */
-	idx = !qid ? 0 : (qid - 1) % num_active_cpus();
-
-	/* find the n'th active cpu */
-	for (cpu = 0, cnt = 0; ; ) {
-		if (cpu_active(cpu)) {
-			if (cnt == idx)
-				break;
-			cnt++;
-		}
-		cpu = (cpu + 1) % num_possible_cpus();
-	}
-
-	return cpu;
-}
-
 static struct nvmet_fc_tgt_queue *
 nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
 			u16 qid, u16 sqsize)
@@ -653,7 +619,6 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
 	queue->qid = qid;
 	queue->sqsize = sqsize;
 	queue->assoc = assoc;
-	queue->cpu = nvmet_fc_queue_to_cpu(assoc->tgtport, qid);
 	INIT_LIST_HEAD(&queue->fod_list);
 	INIT_LIST_HEAD(&queue->avail_defer_list);
 	INIT_LIST_HEAD(&queue->pending_cmd_list);
@@ -2145,26 +2110,12 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod)
 	}
 }
 
-static void
-nvmet_fc_fcp_rqst_op_done_work(struct work_struct *work)
-{
-	struct nvmet_fc_fcp_iod *fod =
-		container_of(work, struct nvmet_fc_fcp_iod, done_work);
-
-	nvmet_fc_fod_op_done(fod);
-}
-
 static void
 nvmet_fc_xmt_fcp_op_done(struct nvmefc_tgt_fcp_req *fcpreq)
 {
 	struct nvmet_fc_fcp_iod *fod = fcpreq->nvmet_fc_private;
-	struct nvmet_fc_tgt_queue *queue = fod->queue;
 
-	if (fod->tgtport->ops->target_features & NVMET_FCTGTFEAT_OPDONE_IN_ISR)
-		/* context switch so completion is not in ISR context */
-		queue_work_on(queue->cpu, queue->work_q, &fod->done_work);
-	else
-		nvmet_fc_fod_op_done(fod);
+	nvmet_fc_fod_op_done(fod);
 }
 
 /*
@@ -2332,19 +2283,6 @@ transport_error:
 	nvmet_fc_abort_op(tgtport, fod);
 }
 
-/*
- * Actual processing routine for received FC-NVME LS Requests from the LLD
- */
-static void
-nvmet_fc_handle_fcp_rqst_work(struct work_struct *work)
-{
-	struct nvmet_fc_fcp_iod *fod =
-		container_of(work, struct nvmet_fc_fcp_iod, work);
-	struct nvmet_fc_tgtport *tgtport = fod->tgtport;
-
-	nvmet_fc_handle_fcp_rqst(tgtport, fod);
-}
-
 /**
  * nvmet_fc_rcv_fcp_req - transport entry point called by an LLDD
  *                       upon the reception of a NVME FCP CMD IU.
diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h
index f4ab3b1925ac..91745cc3704c 100644
--- a/include/linux/nvme-fc-driver.h
+++ b/include/linux/nvme-fc-driver.h
@@ -648,22 +648,6 @@ enum {
 		 * sequence in one LLDD operation. Errors during Data
 		 * sequence transmit must not allow RSP sequence to be sent.
 		 */
-	NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 1),
-		/* Bit 2: When 0, the LLDD is calling the cmd rcv handler
-		 * in a non-isr context, allowing the transport to finish
-		 * op completion in the calling context. When 1, the LLDD
-		 * is calling the cmd rcv handler in an ISR context,
-		 * requiring the transport to transition to a workqueue
-		 * for op completion.
-		 */
-	NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 2),
-		/* Bit 3: When 0, the LLDD is calling the op done handler
-		 * in a non-isr context, allowing the transport to finish
-		 * op completion in the calling context. When 1, the LLDD
-		 * is calling the op done handler in an ISR context,
-		 * requiring the transport to transition to a workqueue
-		 * for op completion.
-		 */
 };
 
 

From e6a622fd6d66b83779357e3400f487fc159a7d83 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 19 Nov 2018 14:11:12 -0800
Subject: [PATCH 233/351] nvmet: support fabrics sq flow control

Technical proposal 8005 "fabrics SQ flow control" introduces a mode
where a host and controller agree to omit sq_head pointer updates
when sending nvme completions.

In case the host indicated desire to operate in this mode (connect attribute)
the controller will return back a connect completion with sq_head value
of 0xffff as indication that it will omit sq_head pointer updates.

This mode saves us an atomic update in the I/O path.

Reviewed-by: Hannes Reinecke <hare@suse.com>
[hch: suggested better implementation]
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/core.c        | 23 +++++++++++++----------
 drivers/nvme/target/fabrics-cmd.c |  6 ++++++
 drivers/nvme/target/nvmet.h       |  1 +
 include/linux/nvme.h              |  4 ++++
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 5aa5a3cc5395..2df70010e9f2 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -597,26 +597,28 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 	return ns;
 }
 
-static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
+static void nvmet_update_sq_head(struct nvmet_req *req)
 {
-	u32 old_sqhd, new_sqhd;
-	u16 sqhd;
-
-	if (status)
-		nvmet_set_status(req, status);
-
 	if (req->sq->size) {
+		u32 old_sqhd, new_sqhd;
+
 		do {
 			old_sqhd = req->sq->sqhd;
 			new_sqhd = (old_sqhd + 1) % req->sq->size;
 		} while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
 					old_sqhd);
 	}
-	sqhd = req->sq->sqhd & 0x0000FFFF;
-	req->rsp->sq_head = cpu_to_le16(sqhd);
+	req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
+}
+
+static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
+{
+	if (!req->sq->sqhd_disabled)
+		nvmet_update_sq_head(req);
 	req->rsp->sq_id = cpu_to_le16(req->sq->qid);
 	req->rsp->command_id = req->cmd->common.command_id;
-
+	if (status)
+		nvmet_set_status(req, status);
 	if (req->ns)
 		nvmet_put_namespace(req->ns);
 	req->ops->queue_response(req);
@@ -765,6 +767,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 	req->sg_cnt = 0;
 	req->transfer_len = 0;
 	req->rsp->status = 0;
+	req->rsp->sq_head = 0;
 	req->ns = NULL;
 
 	/* no support for fused commands yet */
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index d84ae004cb85..328ae46d8344 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -115,6 +115,12 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
 	/* note: convert queue size from 0's-based value to 1's-based value */
 	nvmet_cq_setup(ctrl, req->cq, qid, sqsize + 1);
 	nvmet_sq_setup(ctrl, req->sq, qid, sqsize + 1);
+
+	if (c->cattr & NVME_CONNECT_DISABLE_SQFLOW) {
+		req->sq->sqhd_disabled = true;
+		req->rsp->sq_head = cpu_to_le16(0xffff);
+	}
+
 	return 0;
 }
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 03988fe9d915..547108c41ce9 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -106,6 +106,7 @@ struct nvmet_sq {
 	u16			qid;
 	u16			size;
 	u32			sqhd;
+	bool			sqhd_disabled;
 	struct completion	free_done;
 	struct completion	confirm_done;
 };
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 77d320d32ee5..e7d731776f62 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1044,6 +1044,10 @@ struct nvmf_disc_rsp_page_hdr {
 	struct nvmf_disc_rsp_page_entry entries[0];
 };
 
+enum {
+	NVME_CONNECT_DISABLE_SQFLOW	= (1 << 2),
+};
+
 struct nvmf_connect_command {
 	__u8		opcode;
 	__u8		resv1;

From 0445e1b5a2fed4612b7f72d9a56889c026b60aa9 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 19 Nov 2018 14:11:13 -0800
Subject: [PATCH 234/351] nvmet: don't override treq upon modification.

Only override the allowed parts of it.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
[hch: slight tweak to the NVME_TREQ_SECURE_CHANNEL_MASK definition]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/configfs.c | 11 +++++++----
 include/linux/nvme.h           |  2 ++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index d37fd7713bbc..260a401db01c 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -153,7 +153,8 @@ CONFIGFS_ATTR(nvmet_, addr_traddr);
 static ssize_t nvmet_addr_treq_show(struct config_item *item,
 		char *page)
 {
-	switch (to_nvmet_port(item)->disc_addr.treq) {
+	switch (to_nvmet_port(item)->disc_addr.treq &
+		NVME_TREQ_SECURE_CHANNEL_MASK) {
 	case NVMF_TREQ_NOT_SPECIFIED:
 		return sprintf(page, "not specified\n");
 	case NVMF_TREQ_REQUIRED:
@@ -169,6 +170,7 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
 		const char *page, size_t count)
 {
 	struct nvmet_port *port = to_nvmet_port(item);
+	u8 treq = port->disc_addr.treq & ~NVME_TREQ_SECURE_CHANNEL_MASK;
 
 	if (port->enabled) {
 		pr_err("Cannot modify address while enabled\n");
@@ -177,15 +179,16 @@ static ssize_t nvmet_addr_treq_store(struct config_item *item,
 	}
 
 	if (sysfs_streq(page, "not specified")) {
-		port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED;
+		treq |= NVMF_TREQ_NOT_SPECIFIED;
 	} else if (sysfs_streq(page, "required")) {
-		port->disc_addr.treq = NVMF_TREQ_REQUIRED;
+		treq |= NVMF_TREQ_REQUIRED;
 	} else if (sysfs_streq(page, "not required")) {
-		port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED;
+		treq |= NVMF_TREQ_NOT_REQUIRED;
 	} else {
 		pr_err("Invalid value '%s' for treq\n", page);
 		return -EINVAL;
 	}
+	port->disc_addr.treq = treq;
 
 	return count;
 }
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index e7d731776f62..4fc48071e5ea 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -61,6 +61,8 @@ enum {
 	NVMF_TREQ_NOT_SPECIFIED	= 0,	/* Not specified */
 	NVMF_TREQ_REQUIRED	= 1,	/* Required */
 	NVMF_TREQ_NOT_REQUIRED	= 2,	/* Not Required */
+#define NVME_TREQ_SECURE_CHANNEL_MASK \
+	(NVMF_TREQ_REQUIRED | NVMF_TREQ_NOT_REQUIRED)
 };
 
 /* RDMA QP Service Type codes for Discovery Log Page entry TSAS

From 9b95d2fb857f242aacbf4e205656818b0ef067e1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 20 Nov 2018 10:34:19 +0100
Subject: [PATCH 235/351] nvmet: expose support for fabrics SQ flow control
 disable in treq

Technical Proposal introduces an indication for SQ flow control
disable support. Expose it since we are able to operate in this mode.

Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/configfs.c | 1 +
 include/linux/nvme.h           | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 260a401db01c..db2cb64be7ba 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -1214,6 +1214,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
 	port->inline_data_size = -1;	/* < 0 == let the transport choose */
 
 	port->disc_addr.portid = cpu_to_le16(portid);
+	port->disc_addr.treq = NVMF_TREQ_DISABLE_SQFLOW;
 	config_group_init_type_name(&port->group, name, &nvmet_port_type);
 
 	config_group_init_type_name(&port->subsys_group,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4fc48071e5ea..c03973c215ad 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -58,11 +58,13 @@ enum {
 
 /* Transport Requirements codes for Discovery Log Page entry TREQ field */
 enum {
-	NVMF_TREQ_NOT_SPECIFIED	= 0,	/* Not specified */
-	NVMF_TREQ_REQUIRED	= 1,	/* Required */
-	NVMF_TREQ_NOT_REQUIRED	= 2,	/* Not Required */
+	NVMF_TREQ_NOT_SPECIFIED	= 0,		/* Not specified */
+	NVMF_TREQ_REQUIRED	= 1,		/* Required */
+	NVMF_TREQ_NOT_REQUIRED	= 2,		/* Not Required */
 #define NVME_TREQ_SECURE_CHANNEL_MASK \
 	(NVMF_TREQ_REQUIRED | NVMF_TREQ_NOT_REQUIRED)
+
+	NVMF_TREQ_DISABLE_SQFLOW = (1 << 2),	/* Supports SQ flow control disable */
 };
 
 /* RDMA QP Service Type codes for Discovery Log Page entry TSAS

From 8154ed730bc64f68bc28feb20e641c2e8a0eeba5 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Mon, 19 Nov 2018 14:11:15 -0800
Subject: [PATCH 236/351] nvme: disable fabrics SQ flow control when asked by
 the user

As for now, we don't care about sq_head pointer updates anyway, so
at least allow the controller to micro-optimize by omiting this update.

Note that we will probably need to support it when a controller
that requires this comes along.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/fabrics.c | 13 ++++++++++++-
 drivers/nvme/host/fabrics.h |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index bd0969db6225..10074ac7731b 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -392,6 +392,9 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl)
 	cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 :
 		cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000);
 
+	if (ctrl->opts->disable_sqflow)
+		cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
+
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
@@ -451,6 +454,9 @@ int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid)
 	cmd.connect.qid = cpu_to_le16(qid);
 	cmd.connect.sqsize = cpu_to_le16(ctrl->sqsize);
 
+	if (ctrl->opts->disable_sqflow)
+		cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW;
+
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
@@ -607,6 +613,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_HOST_TRADDR,		"host_traddr=%s"	},
 	{ NVMF_OPT_HOST_ID,		"hostid=%s"		},
 	{ NVMF_OPT_DUP_CONNECT,		"duplicate_connect"	},
+	{ NVMF_OPT_DISABLE_SQFLOW,	"disable_sqflow"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -817,6 +824,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DUP_CONNECT:
 			opts->duplicate_connect = true;
 			break;
+		case NVMF_OPT_DISABLE_SQFLOW:
+			opts->disable_sqflow = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
@@ -933,7 +943,8 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
 #define NVMF_REQUIRED_OPTS	(NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
 #define NVMF_ALLOWED_OPTS	(NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
 				 NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \
-				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT)
+				 NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
+				 NVMF_OPT_DISABLE_SQFLOW)
 
 static struct nvme_ctrl *
 nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 6ea6275f332a..ecd9a006a091 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -58,6 +58,7 @@ enum {
 	NVMF_OPT_CTRL_LOSS_TMO	= 1 << 11,
 	NVMF_OPT_HOST_ID	= 1 << 12,
 	NVMF_OPT_DUP_CONNECT	= 1 << 13,
+	NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
 };
 
 /**
@@ -101,6 +102,7 @@ struct nvmf_ctrl_options {
 	unsigned int		kato;
 	struct nvmf_host	*host;
 	int			max_reconnects;
+	bool			disable_sqflow;
 };
 
 /*

From 5c4072ad1c151c65c3d60f95536786042cd49e29 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Mon, 19 Nov 2018 10:58:52 +0000
Subject: [PATCH 237/351] nvme: Remove unused forward declaration

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 48ffb1d685c2..71d2a89bbd1d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -97,7 +97,6 @@ static dev_t nvme_chr_devt;
 static struct class *nvme_class;
 static struct class *nvme_subsys_class;
 
-static void nvme_ns_remove(struct nvme_ns *ns);
 static int nvme_revalidate_disk(struct gendisk *disk);
 static void nvme_put_subsystem(struct nvme_subsystem *subsys);
 static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,

From ad1f824948e4ed886529219cf7cd717d078c630d Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Mon, 19 Nov 2018 10:58:51 +0000
Subject: [PATCH 238/351] nvmet-rdma: Add unlikely for response allocated check

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/rdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 3f7971d3706d..932242e27b03 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -196,7 +196,7 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
 {
 	unsigned long flags;
 
-	if (rsp->allocated) {
+	if (unlikely(rsp->allocated)) {
 		kfree(rsp);
 		return;
 	}

From cb019da3dabf60d792f76a913540815f06abb1d7 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Mon, 19 Nov 2018 13:35:30 -0800
Subject: [PATCH 239/351] nvmet: use unlikely for req status check

This patch adds unlikely in the nvmet request completion path for the
status check in the low level function __nvmet_req_complete.
This is helpful in the scenario where host and target connection is
working smoothly.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 2df70010e9f2..e468100b9211 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -617,7 +617,7 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 		nvmet_update_sq_head(req);
 	req->rsp->sq_id = cpu_to_le16(req->sq->qid);
 	req->rsp->command_id = req->cmd->common.command_id;
-	if (status)
+	if (unlikely(status))
 		nvmet_set_status(req, status);
 	if (req->ns)
 		nvmet_put_namespace(req->ns);

From 5a3a6d6965865d0da5c743a0f9c58f84373f88e7 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Mon, 19 Nov 2018 15:16:39 -0800
Subject: [PATCH 240/351] nvmet: fix the structure member indentation

This is a cleanup patch which fixes the structure member indentation
introduced by the p2p:

commit c6925093d0b2 ("nvmet: Optionally use PCI P2P memory").
We don't change any functionality in this patch.

This is needed so that any future members will also follow the uniform
indentation.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Acked-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/target/nvmet.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 547108c41ce9..7d8b7a7d572a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -200,8 +200,8 @@ struct nvmet_ctrl {
 	char			subsysnqn[NVMF_NQN_FIELD_LEN];
 	char			hostnqn[NVMF_NQN_FIELD_LEN];
 
-	struct device *p2p_client;
-	struct radix_tree_root p2p_ns_map;
+	struct device		*p2p_client;
+	struct radix_tree_root	p2p_ns_map;
 };
 
 struct nvmet_subsys {
@@ -314,8 +314,8 @@ struct nvmet_req {
 	void (*execute)(struct nvmet_req *req);
 	const struct nvmet_fabrics_ops *ops;
 
-	struct pci_dev *p2p_dev;
-	struct device *p2p_client;
+	struct pci_dev		*p2p_dev;
+	struct device		*p2p_client;
 };
 
 extern struct workqueue_struct *buffered_io_wq;

From 49cd84b6f8b677ef45731ed56ddb802cdbb94c9e Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Tue, 27 Nov 2018 09:40:57 -0700
Subject: [PATCH 241/351] nvme: implement Enhanced Command Retry

A controller may have an internal state that is not able to successfully
process commands for a short duration. In such states, an immediate
command requeue is expected to fail. The driver may exceed its max
retry count, which permanently ends the command in failure when the same
command would succeed after waiting for the controller to be ready.

NVMe ratified TP 4033 provides a delay hint in the completion status
code for failed commands. Implement the retry delay based on the command
completion status and the controller's requested delay.

Note that requeued commands are handled per request_queue, not per
individual request. If multiple commands fail, the controller should
consistently report the desired delay time for retryable commands in
all CQEs, otherwise the requeue list may be kicked too soon.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 47 ++++++++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h |  1 +
 include/linux/nvme.h     | 17 ++++++++++++++-
 3 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 71d2a89bbd1d..f90576862736 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -244,6 +244,22 @@ static inline bool nvme_req_needs_retry(struct request *req)
 	return true;
 }
 
+static void nvme_retry_req(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	unsigned long delay = 0;
+	u16 crd;
+
+	/* The mask and shift result must be <= 3 */
+	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
+	if (ns && crd)
+		delay = ns->ctrl->crdt[crd - 1] * 100;
+
+	nvme_req(req)->retries++;
+	blk_mq_requeue_request(req, false);
+	blk_mq_delay_kick_requeue_list(req->q, delay);
+}
+
 void nvme_complete_rq(struct request *req)
 {
 	blk_status_t status = nvme_error_status(req);
@@ -261,8 +277,7 @@ void nvme_complete_rq(struct request *req)
 		}
 
 		if (!blk_queue_dying(req->q)) {
-			nvme_req(req)->retries++;
-			blk_mq_requeue_request(req, true);
+			nvme_retry_req(req);
 			return;
 		}
 	}
@@ -1883,6 +1898,26 @@ static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
 	return ret;
 }
 
+static int nvme_configure_acre(struct nvme_ctrl *ctrl)
+{
+	struct nvme_feat_host_behavior *host;
+	int ret;
+
+	/* Don't bother enabling the feature if retry delay is not reported */
+	if (!ctrl->crdt[0])
+		return 0;
+
+	host = kzalloc(sizeof(*host), GFP_KERNEL);
+	if (!host)
+		return 0;
+
+	host->acre = NVME_ENABLE_ACRE;
+	ret = nvme_set_features(ctrl, NVME_FEAT_HOST_BEHAVIOR, 0,
+				host, sizeof(*host), NULL);
+	kfree(host);
+	return ret;
+}
+
 static int nvme_configure_apst(struct nvme_ctrl *ctrl)
 {
 	/*
@@ -2404,6 +2439,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
 	}
 
+	ctrl->crdt[0] = le16_to_cpu(id->crdt1);
+	ctrl->crdt[1] = le16_to_cpu(id->crdt2);
+	ctrl->crdt[2] = le16_to_cpu(id->crdt3);
+
 	ctrl->oacs = le16_to_cpu(id->oacs);
 	ctrl->oncs = le16_to_cpup(&id->oncs);
 	ctrl->oaes = le32_to_cpu(id->oaes);
@@ -2504,6 +2543,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	if (ret < 0)
 		return ret;
 
+	ret = nvme_configure_acre(ctrl);
+	if (ret < 0)
+		return ret;
+
 	ctrl->identified = true;
 
 	return 0;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f2594d468f29..79e621f5b326 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -181,6 +181,7 @@ struct nvme_ctrl {
 	u32 page_size;
 	u32 max_hw_sectors;
 	u32 max_segments;
+	u16 crdt[3];
 	u16 oncs;
 	u16 oacs;
 	u16 nssa;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index c03973c215ad..88812cb15be0 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -223,7 +223,11 @@ struct nvme_id_ctrl {
 	__le32			rtd3e;
 	__le32			oaes;
 	__le32			ctratt;
-	__u8			rsvd100[156];
+	__u8			rsvd100[28];
+	__le16			crdt1;
+	__le16			crdt2;
+	__le16			crdt3;
+	__u8			rsvd134[122];
 	__le16			oacs;
 	__u8			acl;
 	__u8			aerl;
@@ -756,6 +760,15 @@ enum {
 	NVME_HOST_MEM_RETURN	= (1 << 1),
 };
 
+struct nvme_feat_host_behavior {
+	__u8 acre;
+	__u8 resv1[511];
+};
+
+enum {
+	NVME_ENABLE_ACRE	= 1,
+};
+
 /* Admin commands */
 
 enum nvme_admin_opcode {
@@ -810,6 +823,7 @@ enum {
 	NVME_FEAT_RRL		= 0x12,
 	NVME_FEAT_PLM_CONFIG	= 0x13,
 	NVME_FEAT_PLM_WINDOW	= 0x14,
+	NVME_FEAT_HOST_BEHAVIOR	= 0x16,
 	NVME_FEAT_SW_PROGRESS	= 0x80,
 	NVME_FEAT_HOST_ID	= 0x81,
 	NVME_FEAT_RESV_MASK	= 0x82,
@@ -1265,6 +1279,7 @@ enum {
 	NVME_SC_ANA_TRANSITION		= 0x303,
 	NVME_SC_HOST_PATH_ERROR		= 0x370,
 
+	NVME_SC_CRD			= 0x1800,
 	NVME_SC_DNR			= 0x4000,
 };
 

From 3236b458c475524d0735f6dd0bd250478434c7b1 Mon Sep 17 00:00:00 2001
From: Israel Rukshin <israelr@mellanox.com>
Date: Mon, 3 Dec 2018 15:50:05 +0000
Subject: [PATCH 242/351] nvme: remove unused function nvme_ctrl_ready

Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/nvme.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 79e621f5b326..8e0ec365ce8d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -368,15 +368,6 @@ static inline void nvme_fault_inject_fini(struct nvme_ns *ns) {}
 static inline void nvme_should_fail(struct request *req) {}
 #endif
 
-static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl)
-{
-	u32 val = 0;
-
-	if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val))
-		return false;
-	return val & NVME_CSTS_RDY;
-}
-
 static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
 {
 	if (!ctrl->subsystem)

From 29cadd2bb6670405086b177120593c1291273fb9 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 7 Dec 2018 22:06:12 -0700
Subject: [PATCH 243/351] scsi: Fix a harmless double shift bug

Smatch generates a warning:

    drivers/scsi/scsi_lib.c:1656 scsi_mq_done() warn: test_bit() takes a bit number

The problem is that SCMD_STATE_COMPLETE is supposed to be bit number 0
and not a mask like "(1 << 0)".  It is used like this:

        if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state))

The test_and_set_bit() has a shift built in so it's a double left shift
and uses bit number 1 instead of number 0.  This bug is harmless because
it's done consistently and it doesn't clash with any other flags.

Fixes: f1342709d18a ("scsi: Do not rely on blk-mq for double completions")
Reviewed-by: Keith Busch <keith.busch@intel.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/scsi/scsi_cmnd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 3de905e205ce..d85e6befa26b 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -62,7 +62,7 @@ struct scsi_pointer {
 #define SCMD_PRESERVED_FLAGS	(SCMD_UNCHECKED_ISA_DMA | SCMD_INITIALIZED)
 
 /* for scmd->state */
-#define SCMD_STATE_COMPLETE	(1 << 0)
+#define SCMD_STATE_COMPLETE	0
 
 struct scsi_cmnd {
 	struct scsi_request req;

From 58ab5e32e6fd83e33943614e7257f2ac5823824a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 9 Dec 2018 17:43:20 -0700
Subject: [PATCH 244/351] sbitmap: silence bogus lockdep IRQ warning

Ming reports that lockdep spews the following trace. What this
essentially says is that the sbitmap swap_lock was used inconsistently
in IRQ enabled and disabled context, and that is usually indicative of a
bug that will cause a deadlock.

For this case, it's a false positive. The swap_lock is used from process
context only, when we swap the bits in the word and cleared mask. We
also end up doing that when we are getting a driver tag, from the
blk_mq_mark_tag_wait(), and from there we hold the waitqueue lock with
IRQs disabled. However, this isn't from an actual IRQ, it's still
process context.

In lieu of a better way to fix this, simply always disable interrupts
when grabbing the swap_lock if lockdep is enabled.

[  100.967642] ================start test sanity/001================
[  101.238280] null: module loaded
[  106.093735]
[  106.094012] =====================================================
[  106.094854] WARNING: SOFTIRQ-safe -> SOFTIRQ-unsafe lock order detected
[  106.095759] 4.20.0-rc3_5d2ee7122c73_for-next+ #1 Not tainted
[  106.096551] -----------------------------------------------------
[  106.097386] fio/1043 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
[  106.098231] 000000004c43fa71
(&(&sb->map[i].swap_lock)->rlock){+.+.}, at: sbitmap_get+0xd5/0x22c
[  106.099431]
[  106.099431] and this task is already holding:
[  106.100229] 000000007eec8b2f
(&(&hctx->dispatch_wait_lock)->rlock){....}, at:
blk_mq_dispatch_rq_list+0x4c1/0xd7c
[  106.101630] which would create a new lock dependency:
[  106.102326]  (&(&hctx->dispatch_wait_lock)->rlock){....} ->
(&(&sb->map[i].swap_lock)->rlock){+.+.}
[  106.103553]
[  106.103553] but this new dependency connects a SOFTIRQ-irq-safe lock:
[  106.104580]  (&sbq->ws[i].wait){..-.}
[  106.104582]
[  106.104582] ... which became SOFTIRQ-irq-safe at:
[  106.105751]   _raw_spin_lock_irqsave+0x4b/0x82
[  106.106284]   __wake_up_common_lock+0x119/0x1b9
[  106.106825]   sbitmap_queue_wake_up+0x33f/0x383
[  106.107456]   sbitmap_queue_clear+0x4c/0x9a
[  106.108046]   __blk_mq_free_request+0x188/0x1d3
[  106.108581]   blk_mq_free_request+0x23b/0x26b
[  106.109102]   scsi_end_request+0x345/0x5d7
[  106.109587]   scsi_io_completion+0x4b5/0x8f0
[  106.110099]   scsi_finish_command+0x412/0x456
[  106.110615]   scsi_softirq_done+0x23f/0x29b
[  106.111115]   blk_done_softirq+0x2a7/0x2e6
[  106.111608]   __do_softirq+0x360/0x6ad
[  106.112062]   run_ksoftirqd+0x2f/0x5b
[  106.112499]   smpboot_thread_fn+0x3a5/0x3db
[  106.113000]   kthread+0x1d4/0x1e4
[  106.113457]   ret_from_fork+0x3a/0x50
[  106.113969]
[  106.113969] to a SOFTIRQ-irq-unsafe lock:
[  106.114672]  (&(&sb->map[i].swap_lock)->rlock){+.+.}
[  106.114674]
[  106.114674] ... which became SOFTIRQ-irq-unsafe at:
[  106.116000] ...
[  106.116003]   _raw_spin_lock+0x33/0x64
[  106.116676]   sbitmap_get+0xd5/0x22c
[  106.117134]   __sbitmap_queue_get+0xe8/0x177
[  106.117731]   __blk_mq_get_tag+0x1e6/0x22d
[  106.118286]   blk_mq_get_tag+0x1db/0x6e4
[  106.118756]   blk_mq_get_driver_tag+0x161/0x258
[  106.119383]   blk_mq_dispatch_rq_list+0x28e/0xd7c
[  106.120043]   blk_mq_do_dispatch_sched+0x23a/0x287
[  106.120607]   blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.121234]   __blk_mq_run_hw_queue+0x137/0x17e
[  106.121781]   __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.122366]   blk_mq_run_hw_queue+0x151/0x187
[  106.122887]   blk_mq_sched_insert_requests+0x13f/0x175
[  106.123492]   blk_mq_flush_plug_list+0x7d6/0x81b
[  106.124042]   blk_flush_plug_list+0x392/0x3d7
[  106.124557]   blk_finish_plug+0x37/0x4f
[  106.125019]   read_pages+0x3ef/0x430
[  106.125446]   __do_page_cache_readahead+0x18e/0x2fc
[  106.126027]   force_page_cache_readahead+0x121/0x133
[  106.126621]   page_cache_sync_readahead+0x35f/0x3bb
[  106.127229]   generic_file_buffered_read+0x410/0x1860
[  106.127932]   __vfs_read+0x319/0x38f
[  106.128415]   vfs_read+0xd2/0x19a
[  106.128817]   ksys_read+0xb9/0x135
[  106.129225]   do_syscall_64+0x140/0x385
[  106.129684]   entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.130292]
[  106.130292] other info that might help us debug this:
[  106.130292]
[  106.131226] Chain exists of:
[  106.131226]   &sbq->ws[i].wait -->
&(&hctx->dispatch_wait_lock)->rlock -->
&(&sb->map[i].swap_lock)->rlock
[  106.131226]
[  106.132865]  Possible interrupt unsafe locking scenario:
[  106.132865]
[  106.133659]        CPU0                    CPU1
[  106.134194]        ----                    ----
[  106.134733]   lock(&(&sb->map[i].swap_lock)->rlock);
[  106.135318]                                local_irq_disable();
[  106.136014]                                lock(&sbq->ws[i].wait);
[  106.136747]
lock(&(&hctx->dispatch_wait_lock)->rlock);
[  106.137742]   <Interrupt>
[  106.138110]     lock(&sbq->ws[i].wait);
[  106.138625]
[  106.138625]  *** DEADLOCK ***
[  106.138625]
[  106.139430] 3 locks held by fio/1043:
[  106.139947]  #0: 0000000076ff0fd9 (rcu_read_lock){....}, at:
hctx_lock+0x29/0xe8
[  106.140813]  #1: 000000002feb1016 (&sbq->ws[i].wait){..-.}, at:
blk_mq_dispatch_rq_list+0x4ad/0xd7c
[  106.141877]  #2: 000000007eec8b2f
(&(&hctx->dispatch_wait_lock)->rlock){....}, at:
blk_mq_dispatch_rq_list+0x4c1/0xd7c
[  106.143267]
[  106.143267] the dependencies between SOFTIRQ-irq-safe lock and the
holding lock:
[  106.144351]  -> (&sbq->ws[i].wait){..-.} ops: 82 {
[  106.144926]     IN-SOFTIRQ-W at:
[  106.145314]                       _raw_spin_lock_irqsave+0x4b/0x82
[  106.146042]                       __wake_up_common_lock+0x119/0x1b9
[  106.146785]                       sbitmap_queue_wake_up+0x33f/0x383
[  106.147567]                       sbitmap_queue_clear+0x4c/0x9a
[  106.148379]                       __blk_mq_free_request+0x188/0x1d3
[  106.149148]                       blk_mq_free_request+0x23b/0x26b
[  106.149864]                       scsi_end_request+0x345/0x5d7
[  106.150546]                       scsi_io_completion+0x4b5/0x8f0
[  106.151367]                       scsi_finish_command+0x412/0x456
[  106.152157]                       scsi_softirq_done+0x23f/0x29b
[  106.152855]                       blk_done_softirq+0x2a7/0x2e6
[  106.153537]                       __do_softirq+0x360/0x6ad
[  106.154280]                       run_ksoftirqd+0x2f/0x5b
[  106.155020]                       smpboot_thread_fn+0x3a5/0x3db
[  106.155828]                       kthread+0x1d4/0x1e4
[  106.156526]                       ret_from_fork+0x3a/0x50
[  106.157267]     INITIAL USE at:
[  106.157713]                      _raw_spin_lock_irqsave+0x4b/0x82
[  106.158542]                      prepare_to_wait_exclusive+0xa8/0x215
[  106.159421]                      blk_mq_get_tag+0x34f/0x6e4
[  106.160186]                      blk_mq_get_request+0x48e/0xaef
[  106.160997]                      blk_mq_make_request+0x27e/0xbd2
[  106.161828]                      generic_make_request+0x4d1/0x873
[  106.162661]                      submit_bio+0x20c/0x253
[  106.163379]                      mpage_bio_submit+0x44/0x4b
[  106.164142]                      mpage_readpages+0x3c2/0x407
[  106.164919]                      read_pages+0x13a/0x430
[  106.165633]                      __do_page_cache_readahead+0x18e/0x2fc
[  106.166530]                      force_page_cache_readahead+0x121/0x133
[  106.167439]                      page_cache_sync_readahead+0x35f/0x3bb
[  106.168337]                      generic_file_buffered_read+0x410/0x1860
[  106.169255]                      __vfs_read+0x319/0x38f
[  106.169977]                      vfs_read+0xd2/0x19a
[  106.170662]                      ksys_read+0xb9/0x135
[  106.171356]                      do_syscall_64+0x140/0x385
[  106.172120]                      entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.173051]   }
[  106.173308]   ... key      at: [<ffffffff85094600>] __key.26481+0x0/0x40
[  106.174219]   ... acquired at:
[  106.174646]    _raw_spin_lock+0x33/0x64
[  106.175183]    blk_mq_dispatch_rq_list+0x4c1/0xd7c
[  106.175843]    blk_mq_do_dispatch_sched+0x23a/0x287
[  106.176518]    blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.177262]    __blk_mq_run_hw_queue+0x137/0x17e
[  106.177900]    __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.178591]    blk_mq_run_hw_queue+0x151/0x187
[  106.179207]    blk_mq_sched_insert_requests+0x13f/0x175
[  106.179926]    blk_mq_flush_plug_list+0x7d6/0x81b
[  106.180571]    blk_flush_plug_list+0x392/0x3d7
[  106.181187]    blk_finish_plug+0x37/0x4f
[  106.181737]    __se_sys_io_submit+0x171/0x304
[  106.182346]    do_syscall_64+0x140/0x385
[  106.182895]    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.183607]
[  106.183830] -> (&(&hctx->dispatch_wait_lock)->rlock){....} ops: 1 {
[  106.184691]    INITIAL USE at:
[  106.185119]                    _raw_spin_lock+0x33/0x64
[  106.185838]                    blk_mq_dispatch_rq_list+0x4c1/0xd7c
[  106.186697]                    blk_mq_do_dispatch_sched+0x23a/0x287
[  106.187551]                    blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.188481]                    __blk_mq_run_hw_queue+0x137/0x17e
[  106.189307]                    __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.190189]                    blk_mq_run_hw_queue+0x151/0x187
[  106.190989]                    blk_mq_sched_insert_requests+0x13f/0x175
[  106.191902]                    blk_mq_flush_plug_list+0x7d6/0x81b
[  106.192739]                    blk_flush_plug_list+0x392/0x3d7
[  106.193535]                    blk_finish_plug+0x37/0x4f
[  106.194269]                    __se_sys_io_submit+0x171/0x304
[  106.195059]                    do_syscall_64+0x140/0x385
[  106.195794]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.196705]  }
[  106.196950]  ... key      at: [<ffffffff84880620>] __key.51231+0x0/0x40
[  106.197853]  ... acquired at:
[  106.198270]    lock_acquire+0x280/0x2f3
[  106.198806]    _raw_spin_lock+0x33/0x64
[  106.199337]    sbitmap_get+0xd5/0x22c
[  106.199850]    __sbitmap_queue_get+0xe8/0x177
[  106.200450]    __blk_mq_get_tag+0x1e6/0x22d
[  106.201035]    blk_mq_get_tag+0x1db/0x6e4
[  106.201589]    blk_mq_get_driver_tag+0x161/0x258
[  106.202237]    blk_mq_dispatch_rq_list+0x5b9/0xd7c
[  106.202902]    blk_mq_do_dispatch_sched+0x23a/0x287
[  106.203572]    blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.204316]    __blk_mq_run_hw_queue+0x137/0x17e
[  106.204956]    __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.205649]    blk_mq_run_hw_queue+0x151/0x187
[  106.206269]    blk_mq_sched_insert_requests+0x13f/0x175
[  106.206997]    blk_mq_flush_plug_list+0x7d6/0x81b
[  106.207644]    blk_flush_plug_list+0x392/0x3d7
[  106.208264]    blk_finish_plug+0x37/0x4f
[  106.208814]    __se_sys_io_submit+0x171/0x304
[  106.209415]    do_syscall_64+0x140/0x385
[  106.209965]    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.210684]
[  106.210904]
[  106.210904] the dependencies between the lock to be acquired
[  106.210905]  and SOFTIRQ-irq-unsafe lock:
[  106.212541] -> (&(&sb->map[i].swap_lock)->rlock){+.+.} ops: 1969 {
[  106.213393]    HARDIRQ-ON-W at:
[  106.213840]                     _raw_spin_lock+0x33/0x64
[  106.214570]                     sbitmap_get+0xd5/0x22c
[  106.215282]                     __sbitmap_queue_get+0xe8/0x177
[  106.216086]                     __blk_mq_get_tag+0x1e6/0x22d
[  106.216876]                     blk_mq_get_tag+0x1db/0x6e4
[  106.217627]                     blk_mq_get_driver_tag+0x161/0x258
[  106.218465]                     blk_mq_dispatch_rq_list+0x28e/0xd7c
[  106.219326]                     blk_mq_do_dispatch_sched+0x23a/0x287
[  106.220198]                     blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.221138]                     __blk_mq_run_hw_queue+0x137/0x17e
[  106.221975]                     __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.222874]                     blk_mq_run_hw_queue+0x151/0x187
[  106.223686]                     blk_mq_sched_insert_requests+0x13f/0x175
[  106.224597]                     blk_mq_flush_plug_list+0x7d6/0x81b
[  106.225444]                     blk_flush_plug_list+0x392/0x3d7
[  106.226255]                     blk_finish_plug+0x37/0x4f
[  106.227006]                     read_pages+0x3ef/0x430
[  106.227717]                     __do_page_cache_readahead+0x18e/0x2fc
[  106.228595]                     force_page_cache_readahead+0x121/0x133
[  106.229491]                     page_cache_sync_readahead+0x35f/0x3bb
[  106.230373]                     generic_file_buffered_read+0x410/0x1860
[  106.231277]                     __vfs_read+0x319/0x38f
[  106.231986]                     vfs_read+0xd2/0x19a
[  106.232666]                     ksys_read+0xb9/0x135
[  106.233350]                     do_syscall_64+0x140/0x385
[  106.234097]                     entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.235012]    SOFTIRQ-ON-W at:
[  106.235460]                     _raw_spin_lock+0x33/0x64
[  106.236195]                     sbitmap_get+0xd5/0x22c
[  106.236913]                     __sbitmap_queue_get+0xe8/0x177
[  106.237715]                     __blk_mq_get_tag+0x1e6/0x22d
[  106.238488]                     blk_mq_get_tag+0x1db/0x6e4
[  106.239244]                     blk_mq_get_driver_tag+0x161/0x258
[  106.240079]                     blk_mq_dispatch_rq_list+0x28e/0xd7c
[  106.240937]                     blk_mq_do_dispatch_sched+0x23a/0x287
[  106.241806]                     blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.242751]                     __blk_mq_run_hw_queue+0x137/0x17e
[  106.243579]                     __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.244469]                     blk_mq_run_hw_queue+0x151/0x187
[  106.245277]                     blk_mq_sched_insert_requests+0x13f/0x175
[  106.246191]                     blk_mq_flush_plug_list+0x7d6/0x81b
[  106.247044]                     blk_flush_plug_list+0x392/0x3d7
[  106.247859]                     blk_finish_plug+0x37/0x4f
[  106.248749]                     read_pages+0x3ef/0x430
[  106.249463]                     __do_page_cache_readahead+0x18e/0x2fc
[  106.250357]                     force_page_cache_readahead+0x121/0x133
[  106.251263]                     page_cache_sync_readahead+0x35f/0x3bb
[  106.252157]                     generic_file_buffered_read+0x410/0x1860
[  106.253084]                     __vfs_read+0x319/0x38f
[  106.253808]                     vfs_read+0xd2/0x19a
[  106.254488]                     ksys_read+0xb9/0x135
[  106.255186]                     do_syscall_64+0x140/0x385
[  106.255943]                     entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.256867]    INITIAL USE at:
[  106.257300]                    _raw_spin_lock+0x33/0x64
[  106.258033]                    sbitmap_get+0xd5/0x22c
[  106.258747]                    __sbitmap_queue_get+0xe8/0x177
[  106.259542]                    __blk_mq_get_tag+0x1e6/0x22d
[  106.260320]                    blk_mq_get_tag+0x1db/0x6e4
[  106.261072]                    blk_mq_get_driver_tag+0x161/0x258
[  106.261902]                    blk_mq_dispatch_rq_list+0x28e/0xd7c
[  106.262762]                    blk_mq_do_dispatch_sched+0x23a/0x287
[  106.263626]                    blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.264571]                    __blk_mq_run_hw_queue+0x137/0x17e
[  106.265409]                    __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.266302]                    blk_mq_run_hw_queue+0x151/0x187
[  106.267111]                    blk_mq_sched_insert_requests+0x13f/0x175
[  106.268028]                    blk_mq_flush_plug_list+0x7d6/0x81b
[  106.268878]                    blk_flush_plug_list+0x392/0x3d7
[  106.269694]                    blk_finish_plug+0x37/0x4f
[  106.270432]                    read_pages+0x3ef/0x430
[  106.271139]                    __do_page_cache_readahead+0x18e/0x2fc
[  106.272040]                    force_page_cache_readahead+0x121/0x133
[  106.272932]                    page_cache_sync_readahead+0x35f/0x3bb
[  106.273811]                    generic_file_buffered_read+0x410/0x1860
[  106.274709]                    __vfs_read+0x319/0x38f
[  106.275407]                    vfs_read+0xd2/0x19a
[  106.276074]                    ksys_read+0xb9/0x135
[  106.276764]                    do_syscall_64+0x140/0x385
[  106.277500]                    entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  106.278417]  }
[  106.278676]  ... key      at: [<ffffffff85094640>] __key.26212+0x0/0x40
[  106.279586]  ... acquired at:
[  106.280026]    lock_acquire+0x280/0x2f3
[  106.280559]    _raw_spin_lock+0x33/0x64
[  106.281101]    sbitmap_get+0xd5/0x22c
[  106.281610]    __sbitmap_queue_get+0xe8/0x177
[  106.282221]    __blk_mq_get_tag+0x1e6/0x22d
[  106.282809]    blk_mq_get_tag+0x1db/0x6e4
[  106.283368]    blk_mq_get_driver_tag+0x161/0x258
[  106.284018]    blk_mq_dispatch_rq_list+0x5b9/0xd7c
[  106.284685]    blk_mq_do_dispatch_sched+0x23a/0x287
[  106.285371]    blk_mq_sched_dispatch_requests+0x379/0x3fc
[  106.286135]    __blk_mq_run_hw_queue+0x137/0x17e
[  106.286806]    __blk_mq_delay_run_hw_queue+0x80/0x25f
[  106.287515]    blk_mq_run_hw_queue+0x151/0x187
[  106.288149]    blk_mq_sched_insert_requests+0x13f/0x175
[  106.289041]    blk_mq_flush_plug_list+0x7d6/0x81b
[  106.289912]    blk_flush_plug_list+0x392/0x3d7
[  106.290590]    blk_finish_plug+0x37/0x4f
[  106.291238]    __se_sys_io_submit+0x171/0x304
[  106.291864]    do_syscall_64+0x140/0x385
[  106.292534]    entry_SYSCALL_64_after_hwframe+0x49/0xbe

Reported-by: Ming Lei <ming.lei@redhat.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 lib/sbitmap.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index a89fbe7cf6ca..2261136ae067 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -118,8 +118,13 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
 {
 	unsigned long mask, val;
+	unsigned long __maybe_unused flags;
 	bool ret = false;
 
+	/* Silence bogus lockdep warning */
+#if defined(CONFIG_LOCKDEP)
+	local_irq_save(flags);
+#endif
 	spin_lock(&sb->map[index].swap_lock);
 
 	if (!sb->map[index].cleared)
@@ -142,6 +147,9 @@ static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
 	ret = true;
 out_unlock:
 	spin_unlock(&sb->map[index].swap_lock);
+#if defined(CONFIG_LOCKDEP)
+	local_irq_restore(flags);
+#endif
 	return ret;
 }
 

From 80a787ba3809deb694ee632919badb73890daf05 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:16 -0500
Subject: [PATCH 245/351] dm: dont rewrite dm_disk(md)->part0.in_flight

generic_start_io_acct and generic_end_io_acct already update the variable
in_flight using atomic operations, so we don't have to overwrite them
again.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index ab72d79775ee..33100e536c4e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -663,8 +663,7 @@ static void start_io_acct(struct dm_io *io)
 	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
 			      &dm_disk(md)->part0);
 
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
-		   atomic_inc_return(&md->pending[rw]));
+	atomic_inc(&md->pending[rw]);
 
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -693,7 +692,6 @@ static void end_io_acct(struct dm_io *io)
 	 * a flush.
 	 */
 	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */

From dbd3bbd291a03f1383de6242e7999e8487cb869b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 6 Dec 2018 11:41:17 -0500
Subject: [PATCH 246/351] dm rq: leverage blk_mq_queue_busy() to check for
 outstanding IO

Now that request-based dm-multipath only supports blk-mq, make use of
the newly introduced blk_mq_queue_busy() to check for outstanding IO --
rather than (ab)using the block core's in_flight counters.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm-rq.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 1f1fe9a618ea..d2397d8fcbd1 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -130,11 +130,11 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
-	atomic_dec(&md->pending[rw]);
-
 	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
-		wake_up(&md->wait);
+	if (unlikely(waitqueue_active(&md->wait))) {
+		if (!blk_mq_queue_busy(md->queue))
+			wake_up(&md->wait);
+	}
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
@@ -436,7 +436,6 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
 	blk_mq_start_request(orig);
-	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
 	if (unlikely(dm_stats_used(&md->stats))) {
 		struct dm_rq_target_io *tio = tio_from_request(orig);

From 112f158f66cbe25fd561a5dfe9c3826e06abf757 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 6 Dec 2018 11:41:18 -0500
Subject: [PATCH 247/351] block: stop passing 'cpu' to all percpu stats methods

All of part_stat_* and related methods are used with preempt disabled,
so there is no need to pass cpu around to allow of them.  Just call
smp_processor_id() as needed.

Suggested-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 16 +++++++++-------
 block/blk-core.c          | 34 +++++++++++++++-------------------
 block/blk-merge.c         |  5 ++---
 block/genhd.c             |  5 ++---
 block/partition-generic.c |  5 ++---
 drivers/md/md.c           |  7 +++----
 include/linux/genhd.h     | 26 +++++++++++++-------------
 7 files changed, 46 insertions(+), 52 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 06760543ec81..0aca870331c3 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1668,11 +1668,12 @@ void generic_start_io_acct(struct request_queue *q, int op,
 			   unsigned long sectors, struct hd_struct *part)
 {
 	const int sgrp = op_stat_group(op);
-	int cpu = part_stat_lock();
 
-	part_round_stats(q, cpu, part);
-	part_stat_inc(cpu, part, ios[sgrp]);
-	part_stat_add(cpu, part, sectors[sgrp], sectors);
+	part_stat_lock();
+
+	part_round_stats(q, part);
+	part_stat_inc(part, ios[sgrp]);
+	part_stat_add(part, sectors[sgrp], sectors);
 	part_inc_in_flight(q, part, op_is_write(op));
 
 	part_stat_unlock();
@@ -1684,10 +1685,11 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
 {
 	unsigned long duration = jiffies - start_time;
 	const int sgrp = op_stat_group(req_op);
-	int cpu = part_stat_lock();
 
-	part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration));
-	part_round_stats(q, cpu, part);
+	part_stat_lock();
+
+	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+	part_round_stats(q, part);
 	part_dec_in_flight(q, part, op_is_write(req_op));
 
 	part_stat_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index ad59102ee30a..734b768c9d9d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -584,14 +584,14 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
 }
 EXPORT_SYMBOL(blk_get_request);
 
-static void part_round_stats_single(struct request_queue *q, int cpu,
+static void part_round_stats_single(struct request_queue *q,
 				    struct hd_struct *part, unsigned long now,
 				    unsigned int inflight)
 {
 	if (inflight) {
-		__part_stat_add(cpu, part, time_in_queue,
+		__part_stat_add(part, time_in_queue,
 				inflight * (now - part->stamp));
-		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
+		__part_stat_add(part, io_ticks, (now - part->stamp));
 	}
 	part->stamp = now;
 }
@@ -599,7 +599,6 @@ static void part_round_stats_single(struct request_queue *q, int cpu,
 /**
  * part_round_stats() - Round off the performance stats on a struct disk_stats.
  * @q: target block queue
- * @cpu: cpu number for stats access
  * @part: target partition
  *
  * The average IO queue length and utilisation statistics are maintained
@@ -613,7 +612,7 @@ static void part_round_stats_single(struct request_queue *q, int cpu,
  * /proc/diskstats.  This accounts immediately for all queue usage up to
  * the current jiffies and restarts the counters again.
  */
-void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
+void part_round_stats(struct request_queue *q, struct hd_struct *part)
 {
 	struct hd_struct *part2 = NULL;
 	unsigned long now = jiffies;
@@ -635,9 +634,9 @@ void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part)
 	part_in_flight(q, part, inflight);
 
 	if (stats & 2)
-		part_round_stats_single(q, cpu, part2, now, inflight[1]);
+		part_round_stats_single(q, part2, now, inflight[1]);
 	if (stats & 1)
-		part_round_stats_single(q, cpu, part, now, inflight[0]);
+		part_round_stats_single(q, part, now, inflight[0]);
 }
 EXPORT_SYMBOL_GPL(part_round_stats);
 
@@ -1362,11 +1361,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes)
 	if (blk_do_io_stat(req)) {
 		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
-		int cpu;
 
-		cpu = part_stat_lock();
+		part_stat_lock();
 		part = req->part;
-		part_stat_add(cpu, part, sectors[sgrp], bytes >> 9);
+		part_stat_add(part, sectors[sgrp], bytes >> 9);
 		part_stat_unlock();
 	}
 }
@@ -1381,14 +1379,13 @@ void blk_account_io_done(struct request *req, u64 now)
 	if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
 		const int sgrp = op_stat_group(req_op(req));
 		struct hd_struct *part;
-		int cpu;
 
-		cpu = part_stat_lock();
+		part_stat_lock();
 		part = req->part;
 
-		part_stat_inc(cpu, part, ios[sgrp]);
-		part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns);
-		part_round_stats(req->q, cpu, part);
+		part_stat_inc(part, ios[sgrp]);
+		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
+		part_round_stats(req->q, part);
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
 		hd_struct_put(part);
@@ -1400,16 +1397,15 @@ void blk_account_io_start(struct request *rq, bool new_io)
 {
 	struct hd_struct *part;
 	int rw = rq_data_dir(rq);
-	int cpu;
 
 	if (!blk_do_io_stat(rq))
 		return;
 
-	cpu = part_stat_lock();
+	part_stat_lock();
 
 	if (!new_io) {
 		part = rq->part;
-		part_stat_inc(cpu, part, merges[rw]);
+		part_stat_inc(part, merges[rw]);
 	} else {
 		part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 		if (!hd_struct_try_get(part)) {
@@ -1424,7 +1420,7 @@ void blk_account_io_start(struct request *rq, bool new_io)
 			part = &rq->rq_disk->part0;
 			hd_struct_get(part);
 		}
-		part_round_stats(rq->q, cpu, part);
+		part_round_stats(rq->q, part);
 		part_inc_in_flight(rq->q, part, rw);
 		rq->part = part;
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4431da69a5cf..a120d59b9705 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -685,12 +685,11 @@ static void blk_account_io_merge(struct request *req)
 {
 	if (blk_do_io_stat(req)) {
 		struct hd_struct *part;
-		int cpu;
 
-		cpu = part_stat_lock();
+		part_stat_lock();
 		part = req->part;
 
-		part_round_stats(req->q, cpu, part);
+		part_round_stats(req->q, part);
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
 		hd_struct_put(part);
diff --git a/block/genhd.c b/block/genhd.c
index 0145bcb0cc76..2fe00cf32b93 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1326,7 +1326,6 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	struct hd_struct *hd;
 	char buf[BDEVNAME_SIZE];
 	unsigned int inflight[2];
-	int cpu;
 
 	/*
 	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1338,8 +1337,8 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
-		cpu = part_stat_lock();
-		part_round_stats(gp->queue, cpu, hd);
+		part_stat_lock();
+		part_round_stats(gp->queue, hd);
 		part_stat_unlock();
 		part_in_flight(gp->queue, hd, inflight);
 		seq_printf(seqf, "%4d %7d %s "
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 5f8db5c5140f..7e663cfb1487 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -121,10 +121,9 @@ ssize_t part_stat_show(struct device *dev,
 	struct hd_struct *p = dev_to_part(dev);
 	struct request_queue *q = part_to_disk(p)->queue;
 	unsigned int inflight[2];
-	int cpu;
 
-	cpu = part_stat_lock();
-	part_round_stats(q, cpu, p);
+	part_stat_lock();
+	part_round_stats(q, p);
 	part_stat_unlock();
 	part_in_flight(q, p, inflight);
 	return sprintf(buf,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fc488cb30a94..9a0a1e0934d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -334,7 +334,6 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 	const int sgrp = op_stat_group(bio_op(bio));
 	struct mddev *mddev = q->queuedata;
 	unsigned int sectors;
-	int cpu;
 
 	blk_queue_split(q, &bio);
 
@@ -359,9 +358,9 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 
 	md_handle_request(mddev, bio);
 
-	cpu = part_stat_lock();
-	part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
-	part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
+	part_stat_lock();
+	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
+	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
 	part_stat_unlock();
 
 	return BLK_QC_T_NONE;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 0c5ee17b4d88..1677cd2a4c4e 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -295,8 +295,8 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 #define part_stat_lock()	({ rcu_read_lock(); get_cpu(); })
 #define part_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
 
-#define __part_stat_add(cpu, part, field, addnd)			\
-	(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
+#define __part_stat_add(part, field, addnd)				\
+	(per_cpu_ptr((part)->dkstats, smp_processor_id())->field += (addnd))
 
 #define part_stat_read(part, field)					\
 ({									\
@@ -333,7 +333,7 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_lock()	({ rcu_read_lock(); 0; })
 #define part_stat_unlock()	rcu_read_unlock()
 
-#define __part_stat_add(cpu, part, field, addnd)				\
+#define __part_stat_add(part, field, addnd)				\
 	((part)->dkstats.field += addnd)
 
 #define part_stat_read(part, field)	((part)->dkstats.field)
@@ -362,19 +362,19 @@ static inline void free_part_stats(struct hd_struct *part)
 	 part_stat_read(part, field[STAT_WRITE]) +			\
 	 part_stat_read(part, field[STAT_DISCARD]))
 
-#define part_stat_add(cpu, part, field, addnd)	do {			\
-	__part_stat_add((cpu), (part), field, addnd);			\
+#define part_stat_add(part, field, addnd)	do {			\
+	__part_stat_add((part), field, addnd);				\
 	if ((part)->partno)						\
-		__part_stat_add((cpu), &part_to_disk((part))->part0,	\
+		__part_stat_add(&part_to_disk((part))->part0,		\
 				field, addnd);				\
 } while (0)
 
-#define part_stat_dec(cpu, gendiskp, field)				\
-	part_stat_add(cpu, gendiskp, field, -1)
-#define part_stat_inc(cpu, gendiskp, field)				\
-	part_stat_add(cpu, gendiskp, field, 1)
-#define part_stat_sub(cpu, gendiskp, field, subnd)			\
-	part_stat_add(cpu, gendiskp, field, -subnd)
+#define part_stat_dec(gendiskp, field)					\
+	part_stat_add(gendiskp, field, -1)
+#define part_stat_inc(gendiskp, field)					\
+	part_stat_add(gendiskp, field, 1)
+#define part_stat_sub(gendiskp, field, subnd)				\
+	part_stat_add(gendiskp, field, -subnd)
 
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
 		    unsigned int inflight[2]);
@@ -399,7 +399,7 @@ static inline void free_part_info(struct hd_struct *part)
 }
 
 /* block/blk-core.c */
-extern void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part);
+extern void part_round_stats(struct request_queue *q, struct hd_struct *part);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,

From 5b18b5a737600fd20ba2045f320d5926ebbf341a Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:19 -0500
Subject: [PATCH 248/351] block: delete part_round_stats and switch to less
 precise counting

We want to convert to per-cpu in_flight counters.

The function part_round_stats needs the in_flight counter every jiffy, it
would be too costly to sum all the percpu variables every jiffy, so it
must be deleted. part_round_stats is used to calculate two counters -
time_in_queue and io_ticks.

time_in_queue can be calculated without part_round_stats, by adding the
duration of the I/O when the I/O ends (the value is almost as exact as the
previously calculated value, except that time for in-progress I/Os is not
counted).

io_ticks can be approximated by increasing the value when I/O is started
or ended and the jiffies value has changed. If the I/Os take less than a
jiffy, the value is as exact as the previously calculated value. If the
I/Os take more than a jiffy, io_ticks can drift behind the previously
calculated value.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c               | 24 +++++++++++++--
 block/blk-core.c          | 62 +++------------------------------------
 block/blk-merge.c         |  1 -
 block/genhd.c             |  3 --
 block/partition-generic.c |  3 --
 include/linux/genhd.h     |  3 +-
 6 files changed, 26 insertions(+), 70 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 0aca870331c3..036e3f0cc736 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1664,6 +1664,22 @@ defer:
 }
 EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
+void update_io_ticks(struct hd_struct *part, unsigned long now)
+{
+	unsigned long stamp;
+again:
+	stamp = READ_ONCE(part->stamp);
+	if (unlikely(stamp != now)) {
+		if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
+			__part_stat_add(part, io_ticks, 1);
+		}
+	}
+	if (part->partno) {
+		part = &part_to_disk(part)->part0;
+		goto again;
+	}
+}
+
 void generic_start_io_acct(struct request_queue *q, int op,
 			   unsigned long sectors, struct hd_struct *part)
 {
@@ -1671,7 +1687,7 @@ void generic_start_io_acct(struct request_queue *q, int op,
 
 	part_stat_lock();
 
-	part_round_stats(q, part);
+	update_io_ticks(part, jiffies);
 	part_stat_inc(part, ios[sgrp]);
 	part_stat_add(part, sectors[sgrp], sectors);
 	part_inc_in_flight(q, part, op_is_write(op));
@@ -1683,13 +1699,15 @@ EXPORT_SYMBOL(generic_start_io_acct);
 void generic_end_io_acct(struct request_queue *q, int req_op,
 			 struct hd_struct *part, unsigned long start_time)
 {
-	unsigned long duration = jiffies - start_time;
+	unsigned long now = jiffies;
+	unsigned long duration = now - start_time;
 	const int sgrp = op_stat_group(req_op);
 
 	part_stat_lock();
 
+	update_io_ticks(part, now);
 	part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
-	part_round_stats(q, part);
+	part_stat_add(part, time_in_queue, duration);
 	part_dec_in_flight(q, part, op_is_write(req_op));
 
 	part_stat_unlock();
diff --git a/block/blk-core.c b/block/blk-core.c
index 734b768c9d9d..268d2b8e9843 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -584,62 +584,6 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op,
 }
 EXPORT_SYMBOL(blk_get_request);
 
-static void part_round_stats_single(struct request_queue *q,
-				    struct hd_struct *part, unsigned long now,
-				    unsigned int inflight)
-{
-	if (inflight) {
-		__part_stat_add(part, time_in_queue,
-				inflight * (now - part->stamp));
-		__part_stat_add(part, io_ticks, (now - part->stamp));
-	}
-	part->stamp = now;
-}
-
-/**
- * part_round_stats() - Round off the performance stats on a struct disk_stats.
- * @q: target block queue
- * @part: target partition
- *
- * The average IO queue length and utilisation statistics are maintained
- * by observing the current state of the queue length and the amount of
- * time it has been in this state for.
- *
- * Normally, that accounting is done on IO completion, but that can result
- * in more than a second's worth of IO being accounted for within any one
- * second, leading to >100% utilisation.  To deal with that, we call this
- * function to do a round-off before returning the results when reading
- * /proc/diskstats.  This accounts immediately for all queue usage up to
- * the current jiffies and restarts the counters again.
- */
-void part_round_stats(struct request_queue *q, struct hd_struct *part)
-{
-	struct hd_struct *part2 = NULL;
-	unsigned long now = jiffies;
-	unsigned int inflight[2];
-	int stats = 0;
-
-	if (part->stamp != now)
-		stats |= 1;
-
-	if (part->partno) {
-		part2 = &part_to_disk(part)->part0;
-		if (part2->stamp != now)
-			stats |= 2;
-	}
-
-	if (!stats)
-		return;
-
-	part_in_flight(q, part, inflight);
-
-	if (stats & 2)
-		part_round_stats_single(q, part2, now, inflight[1]);
-	if (stats & 1)
-		part_round_stats_single(q, part, now, inflight[0]);
-}
-EXPORT_SYMBOL_GPL(part_round_stats);
-
 void blk_put_request(struct request *req)
 {
 	blk_mq_free_request(req);
@@ -1383,9 +1327,10 @@ void blk_account_io_done(struct request *req, u64 now)
 		part_stat_lock();
 		part = req->part;
 
+		update_io_ticks(part, jiffies);
 		part_stat_inc(part, ios[sgrp]);
 		part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
-		part_round_stats(req->q, part);
+		part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
 		hd_struct_put(part);
@@ -1420,11 +1365,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
 			part = &rq->rq_disk->part0;
 			hd_struct_get(part);
 		}
-		part_round_stats(rq->q, part);
 		part_inc_in_flight(rq->q, part, rw);
 		rq->part = part;
 	}
 
+	update_io_ticks(part, jiffies);
+
 	part_stat_unlock();
 }
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index a120d59b9705..9da5629d0887 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -689,7 +689,6 @@ static void blk_account_io_merge(struct request *req)
 		part_stat_lock();
 		part = req->part;
 
-		part_round_stats(req->q, part);
 		part_dec_in_flight(req->q, part, rq_data_dir(req));
 
 		hd_struct_put(part);
diff --git a/block/genhd.c b/block/genhd.c
index 2fe00cf32b93..cdf174d7d329 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1337,9 +1337,6 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
-		part_stat_lock();
-		part_round_stats(gp->queue, hd);
-		part_stat_unlock();
 		part_in_flight(gp->queue, hd, inflight);
 		seq_printf(seqf, "%4d %7d %s "
 			   "%lu %lu %lu %u "
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 7e663cfb1487..42d6138ac876 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -122,9 +122,6 @@ ssize_t part_stat_show(struct device *dev,
 	struct request_queue *q = part_to_disk(p)->queue;
 	unsigned int inflight[2];
 
-	part_stat_lock();
-	part_round_stats(q, p);
-	part_stat_unlock();
 	part_in_flight(q, p, inflight);
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 1677cd2a4c4e..838c2a7a40c5 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -398,8 +398,7 @@ static inline void free_part_info(struct hd_struct *part)
 	kfree(part->info);
 }
 
-/* block/blk-core.c */
-extern void part_round_stats(struct request_queue *q, struct hd_struct *part);
+void update_io_ticks(struct hd_struct *part, unsigned long now);
 
 /* block/genhd.c */
 extern void device_add_disk(struct device *parent, struct gendisk *disk,

From 1226b8dd0e91331cfab500f305b2c264445a0392 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:20 -0500
Subject: [PATCH 249/351] block: switch to per-cpu in-flight counters

Now when part_round_stats is gone, we can switch to per-cpu in-flight
counters.

We use the local-atomic type local_t, so that if part_inc_in_flight or
part_dec_in_flight is reentrantly called from an interrupt, the value will
be correct.

The other counters could be corrupted due to reentrant interrupt, but the
corruption only results in slight counter skew - the in_flight counter
must be exact, so it needs local_t.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/genhd.c         | 43 +++++++++++++++++++++++++++++++++----------
 include/linux/genhd.h | 29 ++++++++++++++++++++++-------
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index cdf174d7d329..9827a2c05db7 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -50,9 +50,9 @@ void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 	if (queue_is_mq(q))
 		return;
 
-	atomic_inc(&part->in_flight[rw]);
+	part_stat_local_inc(part, in_flight[rw]);
 	if (part->partno)
-		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+		part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
 }
 
 void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
@@ -60,38 +60,61 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 	if (queue_is_mq(q))
 		return;
 
-	atomic_dec(&part->in_flight[rw]);
+	part_stat_local_dec(part, in_flight[rw]);
 	if (part->partno)
-		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+		part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
 }
 
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
 		    unsigned int inflight[2])
 {
+	int cpu;
+
 	if (queue_is_mq(q)) {
 		blk_mq_in_flight(q, part, inflight);
 		return;
 	}
 
-	inflight[0] = atomic_read(&part->in_flight[0]) +
-			atomic_read(&part->in_flight[1]);
+	inflight[0] = 0;
+	for_each_possible_cpu(cpu) {
+		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu) +
+			       part_stat_local_read_cpu(part, in_flight[1], cpu);
+	}
+	if ((int)inflight[0] < 0)
+		inflight[0] = 0;
+
 	if (part->partno) {
 		part = &part_to_disk(part)->part0;
-		inflight[1] = atomic_read(&part->in_flight[0]) +
-				atomic_read(&part->in_flight[1]);
+		inflight[1] = 0;
+		for_each_possible_cpu(cpu) {
+			inflight[1] += part_stat_local_read_cpu(part, in_flight[0], cpu) +
+				       part_stat_local_read_cpu(part, in_flight[1], cpu);
+		}
+		if ((int)inflight[1] < 0)
+			inflight[1] = 0;
 	}
 }
 
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 		       unsigned int inflight[2])
 {
+	int cpu;
+
 	if (queue_is_mq(q)) {
 		blk_mq_in_flight_rw(q, part, inflight);
 		return;
 	}
 
-	inflight[0] = atomic_read(&part->in_flight[0]);
-	inflight[1] = atomic_read(&part->in_flight[1]);
+	inflight[0] = 0;
+	inflight[1] = 0;
+	for_each_possible_cpu(cpu) {
+		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu);
+		inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu);
+	}
+	if ((int)inflight[0] < 0)
+		inflight[0] = 0;
+	if ((int)inflight[1] < 0)
+		inflight[1] = 0;
 }
 
 struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 838c2a7a40c5..636b4f687e35 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -17,6 +17,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
 #include <linux/blk_types.h>
+#include <asm/local.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -89,6 +90,7 @@ struct disk_stats {
 	unsigned long merges[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
+	local_t in_flight[2];
 };
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
@@ -122,7 +124,6 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	atomic_t in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -295,8 +296,11 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 #define part_stat_lock()	({ rcu_read_lock(); get_cpu(); })
 #define part_stat_unlock()	do { put_cpu(); rcu_read_unlock(); } while (0)
 
-#define __part_stat_add(part, field, addnd)				\
-	(per_cpu_ptr((part)->dkstats, smp_processor_id())->field += (addnd))
+#define part_stat_get_cpu(part, field, cpu)					\
+	(per_cpu_ptr((part)->dkstats, (cpu))->field)
+
+#define part_stat_get(part, field)					\
+	part_stat_get_cpu(part, field, smp_processor_id())
 
 #define part_stat_read(part, field)					\
 ({									\
@@ -333,10 +337,9 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_lock()	({ rcu_read_lock(); 0; })
 #define part_stat_unlock()	rcu_read_unlock()
 
-#define __part_stat_add(part, field, addnd)				\
-	((part)->dkstats.field += addnd)
-
-#define part_stat_read(part, field)	((part)->dkstats.field)
+#define part_stat_get(part, field)		((part)->dkstats.field)
+#define part_stat_get_cpu(part, field, cpu)	part_stat_get(part, field)
+#define part_stat_read(part, field)		part_stat_get(part, field)
 
 static inline void part_stat_set_all(struct hd_struct *part, int value)
 {
@@ -362,6 +365,9 @@ static inline void free_part_stats(struct hd_struct *part)
 	 part_stat_read(part, field[STAT_WRITE]) +			\
 	 part_stat_read(part, field[STAT_DISCARD]))
 
+#define __part_stat_add(part, field, addnd)				\
+	(part_stat_get(part, field) += (addnd))
+
 #define part_stat_add(part, field, addnd)	do {			\
 	__part_stat_add((part), field, addnd);				\
 	if ((part)->partno)						\
@@ -376,6 +382,15 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_sub(gendiskp, field, subnd)				\
 	part_stat_add(gendiskp, field, -subnd)
 
+#define part_stat_local_dec(gendiskp, field)				\
+	local_dec(&(part_stat_get(gendiskp, field)))
+#define part_stat_local_inc(gendiskp, field)				\
+	local_inc(&(part_stat_get(gendiskp, field)))
+#define part_stat_local_read(gendiskp, field)				\
+	local_read(&(part_stat_get(gendiskp, field)))
+#define part_stat_local_read_cpu(gendiskp, field, cpu)			\
+	local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))
+
 void part_in_flight(struct request_queue *q, struct hd_struct *part,
 		    unsigned int inflight[2]);
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,

From e016b78201a2d9ff40f3f0da072292689af24c7f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:21 -0500
Subject: [PATCH 250/351] block: return just one value from part_in_flight

The previous patches deleted all the code that needed the second value
returned from part_in_flight - now the kernel only uses the first value.

Consequently, part_in_flight (and blk_mq_in_flight) may be changed so that
it only returns one value.

This patch just refactors the code, there's no functional change.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c            | 12 +++++-------
 block/blk-mq.h            |  3 +--
 block/genhd.c             | 34 ++++++++++++----------------------
 block/partition-generic.c |  6 +++---
 include/linux/genhd.h     |  3 +--
 5 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b645275dfe5f..9690f4f8de7e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -100,25 +100,23 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
 	struct mq_inflight *mi = priv;
 
 	/*
-	 * index[0] counts the specific partition that was asked for. index[1]
-	 * counts the ones that are active on the whole device, so increment
-	 * that if mi->part is indeed a partition, and not a whole device.
+	 * index[0] counts the specific partition that was asked for.
 	 */
 	if (rq->part == mi->part)
 		mi->inflight[0]++;
-	if (mi->part->partno)
-		mi->inflight[1]++;
 
 	return true;
 }
 
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-		      unsigned int inflight[2])
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
 {
+	unsigned inflight[2];
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
 	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+
+	return inflight[0];
 }
 
 static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
diff --git a/block/blk-mq.h b/block/blk-mq.h
index a664ea44ffd4..0c9c9ea2fefe 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -187,8 +187,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 	return hctx->nr_ctx && hctx->tags;
 }
 
-void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
-		      unsigned int inflight[2]);
+unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
 void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 			 unsigned int inflight[2]);
 
diff --git a/block/genhd.c b/block/genhd.c
index 9827a2c05db7..1dd8fd6613b8 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -65,34 +65,24 @@ void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
 		part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
 }
 
-void part_in_flight(struct request_queue *q, struct hd_struct *part,
-		    unsigned int inflight[2])
+unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
 {
 	int cpu;
+	unsigned int inflight;
 
 	if (queue_is_mq(q)) {
-		blk_mq_in_flight(q, part, inflight);
-		return;
+		return blk_mq_in_flight(q, part);
 	}
 
-	inflight[0] = 0;
+	inflight = 0;
 	for_each_possible_cpu(cpu) {
-		inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu) +
-			       part_stat_local_read_cpu(part, in_flight[1], cpu);
+		inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
+			    part_stat_local_read_cpu(part, in_flight[1], cpu);
 	}
-	if ((int)inflight[0] < 0)
-		inflight[0] = 0;
+	if ((int)inflight < 0)
+		inflight = 0;
 
-	if (part->partno) {
-		part = &part_to_disk(part)->part0;
-		inflight[1] = 0;
-		for_each_possible_cpu(cpu) {
-			inflight[1] += part_stat_local_read_cpu(part, in_flight[0], cpu) +
-				       part_stat_local_read_cpu(part, in_flight[1], cpu);
-		}
-		if ((int)inflight[1] < 0)
-			inflight[1] = 0;
-	}
+	return inflight;
 }
 
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
@@ -1348,7 +1338,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	struct disk_part_iter piter;
 	struct hd_struct *hd;
 	char buf[BDEVNAME_SIZE];
-	unsigned int inflight[2];
+	unsigned int inflight;
 
 	/*
 	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1360,7 +1350,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
-		part_in_flight(gp->queue, hd, inflight);
+		inflight = part_in_flight(gp->queue, hd);
 		seq_printf(seqf, "%4d %7d %s "
 			   "%lu %lu %lu %u "
 			   "%lu %lu %lu %u "
@@ -1376,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[STAT_WRITE]),
 			   part_stat_read(hd, sectors[STAT_WRITE]),
 			   (unsigned int)part_stat_read_msecs(hd, STAT_WRITE),
-			   inflight[0],
+			   inflight,
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue)),
 			   part_stat_read(hd, ios[STAT_DISCARD]),
diff --git a/block/partition-generic.c b/block/partition-generic.c
index 42d6138ac876..8e596a8dff32 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -120,9 +120,9 @@ ssize_t part_stat_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 	struct request_queue *q = part_to_disk(p)->queue;
-	unsigned int inflight[2];
+	unsigned int inflight;
 
-	part_in_flight(q, p, inflight);
+	inflight = part_in_flight(q, p);
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
@@ -137,7 +137,7 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[STAT_WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[STAT_WRITE]),
 		(unsigned int)part_stat_read_msecs(p, STAT_WRITE),
-		inflight[0],
+		inflight,
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)),
 		part_stat_read(p, ios[STAT_DISCARD]),
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 636b4f687e35..06c0fd594097 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -391,8 +391,7 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_local_read_cpu(gendiskp, field, cpu)			\
 	local_read(&(part_stat_get_cpu(gendiskp, field, cpu)))
 
-void part_in_flight(struct request_queue *q, struct hd_struct *part,
-		    unsigned int inflight[2]);
+unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part);
 void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 		       unsigned int inflight[2]);
 void part_dec_in_flight(struct request_queue *q, struct hd_struct *part,

From 6f75723190d88e1319bea623bfe0292bf3917965 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Thu, 6 Dec 2018 11:41:22 -0500
Subject: [PATCH 251/351] dm: remove the pending IO accounting

Remove the "pending" atomic counters, that duplicate block-core's
in_flight counters, and update md_in_flight() to look at percpu
in_flight counters.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm-core.h |  2 --
 drivers/md/dm.c      | 34 +++++++++++++++-------------------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h
index 224d44503a06..6fe883fac471 100644
--- a/drivers/md/dm-core.h
+++ b/drivers/md/dm-core.h
@@ -65,7 +65,6 @@ struct mapped_device {
 	 */
 	struct work_struct work;
 	wait_queue_head_t wait;
-	atomic_t pending[2];
 	spinlock_t deferred_lock;
 	struct bio_list deferred;
 
@@ -119,7 +118,6 @@ struct mapped_device {
 	struct srcu_struct io_barrier;
 };
 
-int md_in_flight(struct mapped_device *md);
 void disable_write_same(struct mapped_device *md);
 void disable_write_zeroes(struct mapped_device *md);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 33100e536c4e..70568f8b6c53 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,25 +646,30 @@ static void free_tio(struct dm_target_io *tio)
 	bio_put(&tio->clone);
 }
 
-int md_in_flight(struct mapped_device *md)
+static bool md_in_flight(struct mapped_device *md)
 {
-	return atomic_read(&md->pending[READ]) +
-	       atomic_read(&md->pending[WRITE]);
+	int cpu;
+	struct hd_struct *part = &dm_disk(md)->part0;
+
+	for_each_possible_cpu(cpu) {
+		if (part_stat_local_read_cpu(part, in_flight[0], cpu) ||
+		    part_stat_local_read_cpu(part, in_flight[1], cpu))
+			return true;
+	}
+
+	return false;
 }
 
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->orig_bio;
-	int rw = bio_data_dir(bio);
 
 	io->start_time = jiffies;
 
 	generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
 			      &dm_disk(md)->part0);
 
-	atomic_inc(&md->pending[rw]);
-
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
 				    bio->bi_iter.bi_sector, bio_sectors(bio),
@@ -676,8 +681,6 @@ static void end_io_acct(struct dm_io *io)
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->orig_bio;
 	unsigned long duration = jiffies - io->start_time;
-	int pending;
-	int rw = bio_data_dir(bio);
 
 	generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
 			    io->start_time);
@@ -687,16 +690,11 @@ static void end_io_acct(struct dm_io *io)
 				    bio->bi_iter.bi_sector, bio_sectors(bio),
 				    true, duration, &io->stats_aux);
 
-	/*
-	 * After this is decremented the bio must not be touched if it is
-	 * a flush.
-	 */
-	pending = atomic_dec_return(&md->pending[rw]);
-	pending += atomic_read(&md->pending[rw^0x1]);
-
 	/* nudge anyone waiting on suspend queue */
-	if (!pending)
-		wake_up(&md->wait);
+	if (unlikely(waitqueue_active(&md->wait))) {
+		if (!md_in_flight(md))
+			wake_up(&md->wait);
+	}
 }
 
 /*
@@ -1915,8 +1913,6 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->disk)
 		goto bad;
 
-	atomic_set(&md->pending[0], 0);
-	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
 	init_waitqueue_head(&md->eventq);

From e4025e46f093d4549d3043c2c54d444cec480d2b Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 10 Dec 2018 22:34:39 +0100
Subject: [PATCH 252/351] mtip32xx: avoid using semaphores

The "cmd_slot_unal" semaphore is never used in a blocking way
but only as an atomic counter. Change the code to using
atomic_dec_if_positive() as a better API.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 6 +++---
 drivers/block/mtip32xx/mtip32xx.h | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 2b0ac9d01e51..d7dc307d0cc8 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -2680,7 +2680,7 @@ static void mtip_softirq_done_fn(struct request *rq)
 							cmd->direction);
 
 	if (unlikely(cmd->unaligned))
-		up(&dd->port->cmd_slot_unal);
+		atomic_inc(&dd->port->cmd_slot_unal);
 
 	blk_mq_end_request(rq, cmd->status);
 }
@@ -2990,7 +2990,7 @@ static int mtip_hw_init(struct driver_data *dd)
 	else
 		dd->unal_qdepth = 0;
 
-	sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
+	atomic_set(&dd->port->cmd_slot_unal, dd->unal_qdepth);
 
 	/* Spinlock to prevent concurrent issue */
 	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
@@ -3533,7 +3533,7 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
 			cmd->unaligned = 1;
 	}
 
-	if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal))
+	if (cmd->unaligned && atomic_dec_if_positive(&dd->port->cmd_slot_unal) >= 0)
 		return true;
 
 	return false;
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index c33f8c3d9fb4..abce25f27f57 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -433,8 +433,8 @@ struct mtip_port {
 	 */
 	unsigned long ic_pause_timer;
 
-	/* Semaphore to control queue depth of unaligned IOs */
-	struct semaphore cmd_slot_unal;
+	/* Counter to control queue depth of unaligned IOs */
+	atomic_t cmd_slot_unal;
 
 	/* Spinlock for working around command-issue bug. */
 	spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];

From 4ba09f69e20d0a768d91277847ddbd31f476590e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 10 Dec 2018 14:45:19 -0700
Subject: [PATCH 253/351] mtip32xx: use BLK_STS_DEV_RESOURCE for device
 resources

For cases where we can only fail with IO in-flight, we should be using
BLK_STS_DEV_RESOURCE instead of BLK_STS_RESOURCE. The latter refers to
system wide resource constraints.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/mtip32xx/mtip32xx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index d7dc307d0cc8..88e8440e75c3 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3550,7 +3550,7 @@ static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
 	struct mtip_cmd_sg *command_sg;
 
 	if (mtip_commands_active(dd->port))
-		return BLK_STS_RESOURCE;
+		return BLK_STS_DEV_RESOURCE;
 
 	hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
 	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
@@ -3587,7 +3587,7 @@ static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
 		return mtip_issue_reserved_cmd(hctx, rq);
 
 	if (unlikely(mtip_check_unal_depth(hctx, rq)))
-		return BLK_STS_RESOURCE;
+		return BLK_STS_DEV_RESOURCE;
 
 	if (is_se_active(dd) || is_stopped(dd, rq))
 		return BLK_STS_IOERR;

From b7934ba4147a883f7a1b32c6408179274a4d6ed1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 10 Dec 2018 15:45:53 -0700
Subject: [PATCH 254/351] dm: fix inflight IO check

After switching to percpu inflight counters, the inflight check
is totally buggy. It's perfectly valid for some counters to be
non-zero while having a total inflight IO count of 0, that's how
these kinds of counters work (inc on one CPU, dec on another).
Fix the md_in_flight() check to sum all counters before returning
a false positive, potentially.

While at it, remove the inflight read for IO completion. We don't
need it, just wake anyone that's waiting for the IO count to drop
to zero. The caller needs to re-check that value anyway when woken,
which it does.

Fixes: 6f75723190d8 ("dm: remove the pending IO accounting")
Acked-by: Mike Snitzer <snitzer@redhat.com>
Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 70568f8b6c53..79ad4b3d215c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -650,14 +650,14 @@ static bool md_in_flight(struct mapped_device *md)
 {
 	int cpu;
 	struct hd_struct *part = &dm_disk(md)->part0;
+	long sum = 0;
 
 	for_each_possible_cpu(cpu) {
-		if (part_stat_local_read_cpu(part, in_flight[0], cpu) ||
-		    part_stat_local_read_cpu(part, in_flight[1], cpu))
-			return true;
+		sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+		sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
 	}
 
-	return false;
+	return sum != 0;
 }
 
 static void start_io_acct(struct dm_io *io)
@@ -691,10 +691,8 @@ static void end_io_acct(struct dm_io *io)
 				    true, duration, &io->stats_aux);
 
 	/* nudge anyone waiting on suspend queue */
-	if (unlikely(waitqueue_active(&md->wait))) {
-		if (!md_in_flight(md))
-			wake_up(&md->wait);
-	}
+	if (unlikely(waitqueue_active(&md->wait)))
+		wake_up(&md->wait);
 }
 
 /*

From 6451fe73fa0f542a49bfacd7205b88a597897f58 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 9 Dec 2018 11:21:45 -0700
Subject: [PATCH 255/351] nvme: fix irq vs io_queue calculations

Guenter reported an boot hang issue on HPPA after we default to 0 poll
queues. We have two issues in the queue count calculations:

1) We don't separate the poll queues from the read/write queues. This is
   important, since the former doesn't need interrupts.
2) The adjust logic is broken.

Adjust the poll queue count before doing nvme_calc_io_queues(). The poll
queue count is only limited by the IO queue count we were able to get
from the controller, not failures in the IRQ allocation loop. This
leaves nvme_calc_io_queues() just adjusting the read/write queue map.

Reported-by: Reported-by: Guenter Roeck <linux@roeck-us.net>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 64 +++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7732c4979a4e..fb9d8270f32c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2030,60 +2030,40 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 	return ret;
 }
 
-static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
+static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
 {
 	unsigned int this_w_queues = write_queues;
-	unsigned int this_p_queues = poll_queues;
 
 	/*
 	 * Setup read/write queue split
 	 */
-	if (nr_io_queues == 1) {
+	if (irq_queues == 1) {
 		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
 		dev->io_queues[HCTX_TYPE_READ] = 0;
-		dev->io_queues[HCTX_TYPE_POLL] = 0;
 		return;
 	}
 
-	/*
-	 * Configure number of poll queues, if set
-	 */
-	if (this_p_queues) {
-		/*
-		 * We need at least one queue left. With just one queue, we'll
-		 * have a single shared read/write set.
-		 */
-		if (this_p_queues >= nr_io_queues) {
-			this_w_queues = 0;
-			this_p_queues = nr_io_queues - 1;
-		}
-
-		dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
-		nr_io_queues -= this_p_queues;
-	} else
-		dev->io_queues[HCTX_TYPE_POLL] = 0;
-
 	/*
 	 * If 'write_queues' is set, ensure it leaves room for at least
 	 * one read queue
 	 */
-	if (this_w_queues >= nr_io_queues)
-		this_w_queues = nr_io_queues - 1;
+	if (this_w_queues >= irq_queues)
+		this_w_queues = irq_queues - 1;
 
 	/*
 	 * If 'write_queues' is set to zero, reads and writes will share
 	 * a queue set.
 	 */
 	if (!this_w_queues) {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
+		dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues;
 		dev->io_queues[HCTX_TYPE_READ] = 0;
 	} else {
 		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
-		dev->io_queues[HCTX_TYPE_READ] = nr_io_queues - this_w_queues;
+		dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
 	}
 }
 
-static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
+static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	int irq_sets[2];
@@ -2093,6 +2073,20 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 		.sets = irq_sets,
 	};
 	int result = 0;
+	unsigned int irq_queues, this_p_queues;
+
+	/*
+	 * Poll queues don't need interrupts, but we need at least one IO
+	 * queue left over for non-polled IO.
+	 */
+	this_p_queues = poll_queues;
+	if (this_p_queues >= nr_io_queues) {
+		this_p_queues = nr_io_queues - 1;
+		irq_queues = 1;
+	} else {
+		irq_queues = nr_io_queues - this_p_queues;
+	}
+	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
 	/*
 	 * For irq sets, we have to ask for minvec == maxvec. This passes
@@ -2100,7 +2094,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 	 * IRQ vector needs.
 	 */
 	do {
-		nvme_calc_io_queues(dev, nr_io_queues);
+		nvme_calc_io_queues(dev, irq_queues);
 		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
 		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
 		if (!irq_sets[1])
@@ -2111,11 +2105,11 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 		 * 1 + 1 queues, just ask for a single vector. We'll share
 		 * that between the single IO queue and the admin queue.
 		 */
-		if (!(result < 0 && nr_io_queues == 1))
-			nr_io_queues = irq_sets[0] + irq_sets[1] + 1;
+		if (result >= 0 && irq_queues > 1)
+			irq_queues = irq_sets[0] + irq_sets[1] + 1;
 
-		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
-				nr_io_queues,
+		result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
+				irq_queues,
 				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 
 		/*
@@ -2125,12 +2119,12 @@ static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
 		 * likely does not. Back down to ask for just one vector.
 		 */
 		if (result == -ENOSPC) {
-			nr_io_queues--;
-			if (!nr_io_queues)
+			irq_queues--;
+			if (!irq_queues)
 				return result;
 			continue;
 		} else if (result == -EINVAL) {
-			nr_io_queues = 1;
+			irq_queues = 1;
 			continue;
 		} else if (result <= 0)
 			return -EIO;

From c4576aed8d85d808cd6443bda58393d525207d01 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 11 Dec 2018 09:10:26 -0500
Subject: [PATCH 256/351] dm: fix request-based dm's use of
 dm_wait_for_completion

The md->wait waitqueue is used by both bio-based and request-based DM.
Commit dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for
outstanding IO") lost sight of the requirement that
dm_wait_for_completion() must work with all types of DM devices.

Fix md_in_flight() to call the blk-mq or bio-based method accordingly.

Fixes: dbd3bbd291 ("dm rq: leverage blk_mq_queue_busy() to check for outstanding IO")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/dm-rq.c |  6 ++----
 drivers/md/dm.c    | 10 +++++++++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index d2397d8fcbd1..4e06be4f0a62 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -131,10 +131,8 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
 	/* nudge anyone waiting on suspend queue */
-	if (unlikely(waitqueue_active(&md->wait))) {
-		if (!blk_mq_queue_busy(md->queue))
-			wake_up(&md->wait);
-	}
+	if (unlikely(waitqueue_active(&md->wait)))
+		wake_up(&md->wait);
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 79ad4b3d215c..c414d40d645d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -646,7 +646,7 @@ static void free_tio(struct dm_target_io *tio)
 	bio_put(&tio->clone);
 }
 
-static bool md_in_flight(struct mapped_device *md)
+static bool md_in_flight_bios(struct mapped_device *md)
 {
 	int cpu;
 	struct hd_struct *part = &dm_disk(md)->part0;
@@ -660,6 +660,14 @@ static bool md_in_flight(struct mapped_device *md)
 	return sum != 0;
 }
 
+static bool md_in_flight(struct mapped_device *md)
+{
+	if (queue_is_mq(md->queue))
+		return blk_mq_queue_busy(md->queue);
+	else
+		return md_in_flight_bios(md);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;

From f40a62d2674b317a263512996f9a7abbfc8178ec Mon Sep 17 00:00:00 2001
From: Zhoujie Wu <zjwu@marvell.com>
Date: Tue, 11 Dec 2018 20:16:07 +0100
Subject: [PATCH 257/351] lightnvm: pblk: ignore the smeta oob area scan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The smeta area l2p mapping is empty, and actually the
recovery procedure only need to restore data sector's l2p
mapping. So ignore the smeta oob scan.

Signed-off-by: Zhoujie Wu <zjwu@marvell.com>
Reviewed-by: Javier González <javier@javigon.com>
Reviewed-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-recovery.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 5740b7509bd8..0fbd30e0a587 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -334,6 +334,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
 			       struct pblk_recov_alloc p)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
+	struct pblk_line_meta *lm = &pblk->lm;
 	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr *ppa_list;
 	struct pblk_sec_meta *meta_list;
@@ -342,12 +343,12 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
 	void *data;
 	dma_addr_t dma_ppa_list, dma_meta_list;
 	__le64 *lba_list;
-	u64 paddr = 0;
+	u64 paddr = pblk_line_smeta_start(pblk, line) + lm->smeta_sec;
 	bool padded = false;
 	int rq_ppas, rq_len;
 	int i, j;
 	int ret;
-	u64 left_ppas = pblk_sec_in_open_line(pblk, line);
+	u64 left_ppas = pblk_sec_in_open_line(pblk, line) - lm->smeta_sec;
 
 	if (pblk_line_wp_is_unbalanced(pblk, line))
 		pblk_warn(pblk, "recovering unbalanced line (%d)\n", line->id);

From 55e58c5e78aad9d3246f57e7718cf5ee7adde9e3 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Tue, 11 Dec 2018 20:16:08 +0100
Subject: [PATCH 258/351] lightnvm: Fix uninitialized return value in
 nvm_get_chunk_meta()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

With gcc 4.1:

    drivers/lightnvm/core.c: In function ‘nvm_get_bb_meta’:
    drivers/lightnvm/core.c:977: warning: ‘ret’ may be used uninitialized in this function

and

    drivers/nvme/host/lightnvm.c: In function ‘nvme_nvm_get_chk_meta’:
    drivers/nvme/host/lightnvm.c:580: warning: ‘ret’ may be used uninitialized in this function

Indeed, if (for the former) the number of channels or LUNs is zero, or
(for both) the passed number of chunks is zero, ret will be returned
uninitialized.

Fix this by preinitializing ret to zero.

Fixes: aff3fb18f957de93 ("lightnvm: move bad block and chunk state logic to core")
Fixes: a294c199455187d1 ("lightnvm: implement get log report chunk helpers")
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/core.c      | 2 +-
 drivers/nvme/host/lightnvm.c | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 60ab11fcc81c..10e541cb8dc3 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -974,7 +974,7 @@ static int nvm_get_bb_meta(struct nvm_dev *dev, sector_t slba,
 	struct ppa_addr ppa;
 	u8 *blks;
 	int ch, lun, nr_blks;
-	int ret;
+	int ret = 0;
 
 	ppa.ppa = slba;
 	ppa = dev_to_generic_addr(dev, ppa);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index a4f3b263cd6c..d64805dc8efb 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -577,7 +577,8 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
 	struct ppa_addr ppa;
 	size_t left = nchks * sizeof(struct nvme_nvm_chk_meta);
 	size_t log_pos, offset, len;
-	int ret, i, max_len;
+	int i, max_len;
+	int ret = 0;
 
 	/*
 	 * limit requests to maximum 256K to avoid issuing arbitrary large

From 96076f7dde51f332bce5fc5644ddc1e221f64a5a Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:09 +0100
Subject: [PATCH 259/351] lightnvm: pblk: fix chunk close trace event check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The check for chunk closes suffers from an off-by-one issue, leading
to chunk close events not being traced.

Fixes: 4c44abf43d00 ("lightnvm: pblk: add trace events for chunk states")
Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 6944aac43b01..6581c35f51ee 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -531,7 +531,7 @@ void pblk_check_chunk_state_update(struct pblk *pblk, struct nvm_rq *rqd)
 		if (caddr == 0)
 			trace_pblk_chunk_state(pblk_disk_name(pblk),
 							ppa, NVM_CHK_ST_OPEN);
-		else if (caddr == chunk->cnlb)
+		else if (caddr == (chunk->cnlb - 1))
 			trace_pblk_chunk_state(pblk_disk_name(pblk),
 							ppa, NVM_CHK_ST_CLOSED);
 	}

From c12fa401ac8c94a74aff68bb5736b3f1dc695fa8 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:10 +0100
Subject: [PATCH 260/351] lightnvm: pblk: fix resubmission of overwritten write
 err lbas
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make sure we only look up valid lba addresses on the resubmission path.

If an lba is invalidated in the write buffer, that sector will be
submitted to disk (as it is already mapped to a ppa), and that write
might fail, resulting in a crash when trying to look up the lba in the
mapping table (as the lba is marked as invalid).

Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-write.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index fa8726493b39..3ddd16f47106 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -148,9 +148,11 @@ static void pblk_prepare_resubmit(struct pblk *pblk, unsigned int sentry,
 		w_ctx = &entry->w_ctx;
 
 		/* Check if the lba has been overwritten */
-		ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
-		if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
-			w_ctx->lba = ADDR_EMPTY;
+		if (w_ctx->lba != ADDR_EMPTY) {
+			ppa_l2p = pblk_trans_map_get(pblk, w_ctx->lba);
+			if (!pblk_ppa_comp(ppa_l2p, entry->cacheline))
+				w_ctx->lba = ADDR_EMPTY;
+		}
 
 		/* Mark up the entry as submittable again */
 		flags = READ_ONCE(w_ctx->flags);

From ab3887be1e1a2594c9818a77b0f9dfabf0e4ab59 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:11 +0100
Subject: [PATCH 261/351] lightnvm: pblk: account for write error sectors in
 emeta
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Lines inflicted with write errors lines might be recovered
if they have not been recycled after write error garbage collection.

Ensure that the emeta accounting of valid lbas is correct
for such lines to avoid recovery inconsistencies.

Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-write.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 3ddd16f47106..750f04b8a227 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -105,14 +105,20 @@ retry:
 }
 
 /* Map remaining sectors in chunk, starting from ppa */
-static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
+static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa,
+		int rqd_ppas)
 {
 	struct pblk_line *line;
 	struct ppa_addr map_ppa = *ppa;
+	__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+	__le64 *lba_list;
 	u64 paddr;
 	int done = 0;
+	int n = 0;
 
 	line = pblk_ppa_to_line(pblk, *ppa);
+	lba_list = emeta_to_lbas(pblk, line->emeta->buf);
+
 	spin_lock(&line->lock);
 
 	while (!done)  {
@@ -121,10 +127,17 @@ static void pblk_map_remaining(struct pblk *pblk, struct ppa_addr *ppa)
 		if (!test_and_set_bit(paddr, line->map_bitmap))
 			line->left_msecs--;
 
+		if (n < rqd_ppas && lba_list[paddr] != addr_empty)
+			line->nr_valid_lbas--;
+
+		lba_list[paddr] = addr_empty;
+
 		if (!test_and_set_bit(paddr, line->invalid_bitmap))
 			le32_add_cpu(line->vsc, -1);
 
 		done = nvm_next_ppa_in_chk(pblk->dev, &map_ppa);
+
+		n++;
 	}
 
 	line->w_err_gc->has_write_err = 1;
@@ -202,7 +215,7 @@ static void pblk_submit_rec(struct work_struct *work)
 
 	pblk_log_write_err(pblk, rqd);
 
-	pblk_map_remaining(pblk, ppa_list);
+	pblk_map_remaining(pblk, ppa_list, rqd->nr_ppas);
 	pblk_queue_resubmit(pblk, c_ctx);
 
 	pblk_up_rq(pblk, c_ctx->lun_bitmap);

From 525f7bb2c9f9b2c6673854eade89e98fb3ba7802 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:12 +0100
Subject: [PATCH 262/351] lightnvm: pblk: stop writes gracefully when running
 out of lines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If mapping fails (i.e. when running out of lines), handle the error
and stop writing.

Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-map.c   | 47 +++++++++++++++++++++--------------
 drivers/lightnvm/pblk-write.c | 30 ++++++++++++++--------
 drivers/lightnvm/pblk.h       |  4 +--
 3 files changed, 51 insertions(+), 30 deletions(-)

diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6dcbd44e3acb..5a3c28cce8ab 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -33,6 +33,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	int nr_secs = pblk->min_write_pgs;
 	int i;
 
+	if (!line)
+		return -ENOSPC;
+
 	if (pblk_line_is_full(line)) {
 		struct pblk_line *prev_line = line;
 
@@ -42,8 +45,11 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 		line = pblk_line_replace_data(pblk);
 		pblk_line_close_meta(pblk, prev_line);
 
-		if (!line)
-			return -EINTR;
+		if (!line) {
+			pblk_pipeline_stop(pblk);
+			return -ENOSPC;
+		}
+
 	}
 
 	emeta = line->emeta;
@@ -84,7 +90,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	return 0;
 }
 
-void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 		 unsigned long *lun_bitmap, unsigned int valid_secs,
 		 unsigned int off)
 {
@@ -93,20 +99,22 @@ void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 	unsigned int map_secs;
 	int min = pblk->min_write_pgs;
 	int i;
+	int ret;
 
 	for (i = off; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-		if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs)) {
-			bio_put(rqd->bio);
-			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
-			pblk_pipeline_stop(pblk);
-		}
+
+		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs);
+		if (ret)
+			return ret;
 	}
+
+	return 0;
 }
 
 /* only if erase_ppa is set, acquire erase semaphore */
-void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 		       unsigned int sentry, unsigned long *lun_bitmap,
 		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
 {
@@ -119,15 +127,16 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	unsigned int map_secs;
 	int min = pblk->min_write_pgs;
 	int i, erase_lun;
+	int ret;
+
 
 	for (i = 0; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
-		if (pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs)) {
-			bio_put(rqd->bio);
-			pblk_free_rqd(pblk, rqd, PBLK_WRITE);
-			pblk_pipeline_stop(pblk);
-		}
+
+		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
+					lun_bitmap, &meta_list[i], map_secs);
+		if (ret)
+			return ret;
 
 		erase_lun = pblk_ppa_to_pos(geo, ppa_list[i]);
 
@@ -163,7 +172,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	 */
 	e_line = pblk_line_get_erase(pblk);
 	if (!e_line)
-		return;
+		return -ENOSPC;
 
 	/* Erase blocks that are bad in this line but might not be in next */
 	if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
@@ -174,7 +183,7 @@ retry:
 		bit = find_next_bit(d_line->blk_bitmap,
 						lm->blk_per_line, bit + 1);
 		if (bit >= lm->blk_per_line)
-			return;
+			return 0;
 
 		spin_lock(&e_line->lock);
 		if (test_bit(bit, e_line->erase_bitmap)) {
@@ -188,4 +197,6 @@ retry:
 		*erase_ppa = pblk->luns[bit].bppa; /* set ch and lun */
 		erase_ppa->a.blk = e_line->id;
 	}
+
+	return 0;
 }
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 750f04b8a227..2bf78f81862d 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -334,12 +334,13 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	}
 
 	if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
-		pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, valid, 0);
+		ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+							valid, 0);
 	else
-		pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
+		ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
 							valid, erase_ppa);
 
-	return 0;
+	return ret;
 }
 
 static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
@@ -563,7 +564,7 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd)
 							c_ctx->nr_padded);
 }
 
-static int pblk_submit_write(struct pblk *pblk)
+static int pblk_submit_write(struct pblk *pblk, int *secs_left)
 {
 	struct bio *bio;
 	struct nvm_rq *rqd;
@@ -572,6 +573,8 @@ static int pblk_submit_write(struct pblk *pblk)
 	unsigned long pos;
 	unsigned int resubmit;
 
+	*secs_left = 0;
+
 	spin_lock(&pblk->resubmit_lock);
 	resubmit = !list_empty(&pblk->resubmit_list);
 	spin_unlock(&pblk->resubmit_lock);
@@ -601,17 +604,17 @@ static int pblk_submit_write(struct pblk *pblk)
 		 */
 		secs_avail = pblk_rb_read_count(&pblk->rwb);
 		if (!secs_avail)
-			return 1;
+			return 0;
 
 		secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
 		if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
-			return 1;
+			return 0;
 
 		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
 					secs_to_flush);
 		if (secs_to_sync > pblk->max_write_pgs) {
 			pblk_err(pblk, "bad buffer sync calculation\n");
-			return 1;
+			return 0;
 		}
 
 		secs_to_com = (secs_to_sync > secs_avail) ?
@@ -640,6 +643,7 @@ static int pblk_submit_write(struct pblk *pblk)
 	atomic_long_add(secs_to_sync, &pblk->sub_writes);
 #endif
 
+	*secs_left = 1;
 	return 0;
 
 fail_free_bio:
@@ -648,16 +652,22 @@ fail_put_bio:
 	bio_put(bio);
 	pblk_free_rqd(pblk, rqd, PBLK_WRITE);
 
-	return 1;
+	return -EINTR;
 }
 
 int pblk_write_ts(void *data)
 {
 	struct pblk *pblk = data;
+	int secs_left;
+	int write_failure = 0;
 
 	while (!kthread_should_stop()) {
-		if (!pblk_submit_write(pblk))
-			continue;
+		if (!write_failure) {
+			write_failure = pblk_submit_write(pblk, &secs_left);
+
+			if (secs_left)
+				continue;
+		}
 		set_current_state(TASK_INTERRUPTIBLE);
 		io_schedule();
 	}
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 02bb2e98f8a9..f415aae600c8 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -871,10 +871,10 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
 /*
  * pblk map
  */
-void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
+int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 		       unsigned int sentry, unsigned long *lun_bitmap,
 		       unsigned int valid_secs, struct ppa_addr *erase_ppa);
-void pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
+int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 		 unsigned long *lun_bitmap, unsigned int valid_secs,
 		 unsigned int off);
 

From 3bcebc5bac0935d662f30d317e33ffa660bebf93 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:13 +0100
Subject: [PATCH 263/351] lightnvm: pblk: set conservative threshold for user
 writes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In a worst-case scenario (random writes), OP% of sectors
in each line will be invalid, and we will then need
to move data out of 100/OP% lines to free a single line.

So, to prevent the possibility of running out of lines,
temporarily block user writes when there is less than
100/OP% free lines.

Also ensure that pblk creation does not produce instances
with insufficient over provisioning.

Insufficient over-provising is not a problem on real hardware,
but often an issue when running QEMU simulations (with few lines).
100 lines is enough to create a sane instance with the standard
(11%) over provisioning.

Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-init.c | 40 ++++++++++++++++++++++++++++--------
 drivers/lightnvm/pblk-rl.c   |  5 ++---
 drivers/lightnvm/pblk.h      | 12 ++++++++++-
 3 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 13822594647c..f083130d9920 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -635,7 +635,7 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
 	return (lm->emeta_len[1] + lm->emeta_len[2] + lm->emeta_len[3]);
 }
 
-static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
+static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -643,23 +643,41 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
 	struct nvm_geo *geo = &dev->geo;
 	sector_t provisioned;
 	int sec_meta, blk_meta;
+	int minimum;
 
 	if (geo->op == NVM_TARGET_DEFAULT_OP)
 		pblk->op = PBLK_DEFAULT_OP;
 	else
 		pblk->op = geo->op;
 
-	provisioned = nr_free_blks;
+	minimum = pblk_get_min_chks(pblk);
+	provisioned = nr_free_chks;
 	provisioned *= (100 - pblk->op);
 	sector_div(provisioned, 100);
 
-	pblk->op_blks = nr_free_blks - provisioned;
+	if ((nr_free_chks - provisioned) < minimum) {
+		if (geo->op != NVM_TARGET_DEFAULT_OP) {
+			pblk_err(pblk, "OP too small to create a sane instance\n");
+			return -EINTR;
+		}
+
+		/* If the user did not specify an OP value, and PBLK_DEFAULT_OP
+		 * is not enough, calculate and set sane value
+		 */
+
+		provisioned = nr_free_chks - minimum;
+		pblk->op =  (100 * minimum) / nr_free_chks;
+		pblk_info(pblk, "Default OP insufficient, adjusting OP to %d\n",
+				pblk->op);
+	}
+
+	pblk->op_blks = nr_free_chks - provisioned;
 
 	/* Internally pblk manages all free blocks, but all calculations based
 	 * on user capacity consider only provisioned blocks
 	 */
-	pblk->rl.total_blocks = nr_free_blks;
-	pblk->rl.nr_secs = nr_free_blks * geo->clba;
+	pblk->rl.total_blocks = nr_free_chks;
+	pblk->rl.nr_secs = nr_free_chks * geo->clba;
 
 	/* Consider sectors used for metadata */
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
@@ -667,8 +685,10 @@ static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
 
 	pblk->capacity = (provisioned - blk_meta) * geo->clba;
 
-	atomic_set(&pblk->rl.free_blocks, nr_free_blks);
-	atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
+	atomic_set(&pblk->rl.free_blocks, nr_free_chks);
+	atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);
+
+	return 0;
 }
 
 static int pblk_setup_line_meta_chk(struct pblk *pblk, struct pblk_line *line,
@@ -984,7 +1004,7 @@ static int pblk_lines_init(struct pblk *pblk)
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line *line;
 	void *chunk_meta;
-	long nr_free_chks = 0;
+	int nr_free_chks = 0;
 	int i, ret;
 
 	ret = pblk_line_meta_init(pblk);
@@ -1031,7 +1051,9 @@ static int pblk_lines_init(struct pblk *pblk)
 		goto fail_free_lines;
 	}
 
-	pblk_set_provision(pblk, nr_free_chks);
+	ret = pblk_set_provision(pblk, nr_free_chks);
+	if (ret)
+		goto fail_free_lines;
 
 	vfree(chunk_meta);
 	return 0;
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index db55a1c89997..76116d5f78e4 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -214,11 +214,10 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line_meta *lm = &pblk->lm;
-	int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
 	int sec_meta, blk_meta;
-
 	unsigned int rb_windows;
 
+
 	/* Consider sectors used for metadata */
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
 	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
@@ -226,7 +225,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget)
 	rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
 	rl->high_pw = get_count_order(rl->high);
 
-	rl->rsv_blocks = min_blocks;
+	rl->rsv_blocks = pblk_get_min_chks(pblk);
 
 	/* This will always be a power-of-2 */
 	rb_windows = budget / NVM_MAX_VLBA;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index f415aae600c8..e5b88a25d4d6 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -905,7 +905,6 @@ int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
 #define PBLK_GC_MAX_READERS 8	/* Max number of outstanding GC reader jobs */
 #define PBLK_GC_RQ_QD 128	/* Queue depth for inflight GC requests */
 #define PBLK_GC_L_QD 4		/* Queue depth for inflight GC lines */
-#define PBLK_GC_RSV_LINE 1	/* Reserved lines for GC */
 
 int pblk_gc_init(struct pblk *pblk);
 void pblk_gc_exit(struct pblk *pblk, bool graceful);
@@ -1370,4 +1369,15 @@ static inline char *pblk_disk_name(struct pblk *pblk)
 
 	return disk->disk_name;
 }
+
+static inline unsigned int pblk_get_min_chks(struct pblk *pblk)
+{
+	struct pblk_line_meta *lm = &pblk->lm;
+	/* In a worst-case scenario every line will have OP invalid sectors.
+	 * We will then need a minimum of 1/OP lines to free up a single line
+	 */
+
+	return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line;
+
+}
 #endif /* PBLK_H_ */

From c9a1d640d519b40d00dac850d1f17a7df1954689 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:14 +0100
Subject: [PATCH 264/351] lightnvm: pblk: remove unused macro
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ADDR_POOL_SIZE is not used anymore, so remove the macro.

Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-init.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index f083130d9920..3219f335fce9 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -207,9 +207,6 @@ static int pblk_rwb_init(struct pblk *pblk)
 	return pblk_rb_init(&pblk->rwb, buffer_size, threshold, geo->csecs);
 }
 
-/* Minimum pages needed within a lun */
-#define ADDR_POOL_SIZE 64
-
 static int pblk_set_addrf_12(struct pblk *pblk, struct nvm_geo *geo,
 			     struct nvm_addrf_12 *dst)
 {

From 0934ce87b588a3da657b41804bf07518103875a4 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:15 +0100
Subject: [PATCH 265/351] lightnvm: pblk: fix pblk_lines_init error handling
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The chunk metadata is allocated with vmalloc, so we need to use
vfree to free it.

Fixes: 090ee26fd512 ("lightnvm: use internal allocation for chunk log page")
Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 3219f335fce9..0e37104de596 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -1060,7 +1060,7 @@ fail_free_lines:
 		pblk_line_meta_free(l_mg, &pblk->lines[i]);
 	kfree(pblk->lines);
 fail_free_chunk_meta:
-	kfree(chunk_meta);
+	vfree(chunk_meta);
 fail_free_luns:
 	kfree(pblk->luns);
 fail_free_meta:

From e698d9f4e6254b838e4f1a3116ee069bbc378dc0 Mon Sep 17 00:00:00 2001
From: Hans Holmberg <hans.holmberg@cnexlabs.com>
Date: Tue, 11 Dec 2018 20:16:16 +0100
Subject: [PATCH 266/351] lightnvm: pblk: remove dead code in pblk_recov_l2p
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the call to pblk_line_replace_data as it returns
directly because we have not set l_mg->data_next yet.

Signed-off-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Reviewed-by: Javier González <javier@javigon.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-recovery.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 0fbd30e0a587..416d9840544b 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -805,7 +805,6 @@ next:
 		WARN_ON_ONCE(!test_and_clear_bit(meta_line,
 							&l_mg->meta_bitmap));
 		spin_unlock(&l_mg->free_lock);
-		pblk_line_replace_data(pblk);
 	} else {
 		spin_lock(&l_mg->free_lock);
 		/* Allocate next line for preparation */

From 6e82f0ba00b0addeec27b3a95eb41e6223fc8c4f Mon Sep 17 00:00:00 2001
From: Hua Su <suhua.tanke@gmail.com>
Date: Tue, 11 Dec 2018 20:16:17 +0100
Subject: [PATCH 267/351] lightnvm: pblk: fix spelling in comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Hua Su <suhua.tanke@gmail.com>
Updated description.
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-rb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b1f4b51783f4..9f7fa0fe9c77 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -147,7 +147,7 @@ int pblk_rb_init(struct pblk_rb *rb, unsigned int size, unsigned int threshold,
 
 	/*
 	 * Initialize rate-limiter, which controls access to the write buffer
-	 * but user and GC I/O
+	 * by user and GC I/O
 	 */
 	pblk_rl_init(&pblk->rl, rb->nr_entries);
 

From fde201a466c6ad5efd72cb54fdf2cefa8b6c6ad7 Mon Sep 17 00:00:00 2001
From: Hua Su <suhua.tanke@gmail.com>
Date: Tue, 11 Dec 2018 20:16:18 +0100
Subject: [PATCH 268/351] lightnvm: pblk: add lock protection to list
 operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Protect the list_add on the pblk_line_init_bb() error
path in case this code is used for some other purpose
in the future.

Signed-off-by: Hua Su <suhua.tanke@gmail.com>
Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-core.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 6581c35f51ee..44c5dc046912 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1295,15 +1295,22 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
 
 	ret = pblk_line_alloc_bitmaps(pblk, line);
 	if (ret)
-		return ret;
+		goto fail;
 
 	if (!pblk_line_init_bb(pblk, line, 0)) {
-		list_add(&line->list, &l_mg->free_list);
-		return -EINTR;
+		ret = -EINTR;
+		goto fail;
 	}
 
 	pblk_rl_free_lines_dec(&pblk->rl, line, true);
 	return 0;
+
+fail:
+	spin_lock(&l_mg->free_lock);
+	list_add(&line->list, &l_mg->free_list);
+	spin_unlock(&l_mg->free_lock);
+
+	return ret;
 }
 
 void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line)

From 361d889f830ef61e4eae442c4c89fb14b626375f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <javier@javigon.com>
Date: Tue, 11 Dec 2018 20:16:19 +0100
Subject: [PATCH 269/351] lightnvm: pblk: add comments wrt locking in recovery
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pblk's recovery path is single threaded and therefore a number of
assumptions regarding concurrency can be made. To avoid confusion, make
this explicit with a couple of comments in the code.

Signed-off-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-core.c     | 1 +
 drivers/lightnvm/pblk-recovery.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 44c5dc046912..f1b411e7c7c9 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1276,6 +1276,7 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line)
 	return 0;
 }
 
+/* Line allocations in the recovery path are always single threaded */
 int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 416d9840544b..4c726506a831 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -13,6 +13,9 @@
  * General Public License for more details.
  *
  * pblk-recovery.c - pblk's recovery path
+ *
+ * The L2P recovery path is single threaded as the L2P table is updated in order
+ * following the line sequence ID.
  */
 
 #include "pblk.h"

From 85136c0102852fe505c0fbd3f1bf9d17038bb94d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?= <mb@lightnvm.io>
Date: Tue, 11 Dec 2018 20:16:20 +0100
Subject: [PATCH 270/351] lightnvm: simplify geometry enumeration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the geometry of an OCSSD is enumerated using a two step
approach:

First, nvm_register is called, the OCSSD identify command is issued,
and second the geometry sos and csecs values are read either from the
OCSSD identify if it is a 1.2 drive, or from the NVMe namespace data
structure if it is a 2.0 device.

This patch recombines it into a single step, such that nvm_register can
use the csecs and sos fields independent of which version is used. This
enables one to dynamically size the lightnvm subsystem dma pool.

Reviewed-by: Igor Konopko <igor.j.konopko@intel.com>
Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/core.c      | 12 +++++-------
 drivers/nvme/host/core.c     | 18 +++++++++---------
 drivers/nvme/host/lightnvm.c | 18 ++++++------------
 drivers/nvme/host/nvme.h     |  2 --
 4 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 10e541cb8dc3..69b841d682c7 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -1145,25 +1145,23 @@ int nvm_register(struct nvm_dev *dev)
 	if (!dev->q || !dev->ops)
 		return -EINVAL;
 
+	ret = nvm_init(dev);
+	if (ret)
+		return ret;
+
 	dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
 	if (!dev->dma_pool) {
 		pr_err("nvm: could not create dma pool\n");
+		nvm_free(dev);
 		return -ENOMEM;
 	}
 
-	ret = nvm_init(dev);
-	if (ret)
-		goto err_init;
-
 	/* register device with a supported media manager */
 	down_write(&nvm_lock);
 	list_add(&dev->devices, &nvm_devices);
 	up_write(&nvm_lock);
 
 	return 0;
-err_init:
-	dev->ops->destroy_dma_pool(dev->dma_pool);
-	return ret;
 }
 EXPORT_SYMBOL(nvm_register);
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1310753a01e5..c71e879821ad 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1549,8 +1549,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
 	if (ns->noiob)
 		nvme_set_chunk_size(ns);
 	nvme_update_disk_info(disk, ns, id);
-	if (ns->ndev)
-		nvme_nvm_update_nvm_info(ns);
 #ifdef CONFIG_NVME_MULTIPATH
 	if (ns->head->disk) {
 		nvme_update_disk_info(ns->head->disk, ns, id);
@@ -3156,13 +3154,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	nvme_setup_streams_ns(ctrl, ns);
 	nvme_set_disk_name(disk_name, ns, ctrl, &flags);
 
-	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
-		if (nvme_nvm_register(ns, disk_name, node)) {
-			dev_warn(ctrl->device, "LightNVM init failure\n");
-			goto out_unlink_ns;
-		}
-	}
-
 	disk = alloc_disk_node(0, node);
 	if (!disk)
 		goto out_unlink_ns;
@@ -3176,6 +3167,13 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	__nvme_revalidate_disk(disk, id);
 
+	if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
+		if (nvme_nvm_register(ns, disk_name, node)) {
+			dev_warn(ctrl->device, "LightNVM init failure\n");
+			goto out_put_disk;
+		}
+	}
+
 	down_write(&ctrl->namespaces_rwsem);
 	list_add_tail(&ns->list, &ctrl->namespaces);
 	up_write(&ctrl->namespaces_rwsem);
@@ -3189,6 +3187,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	kfree(id);
 
 	return;
+ out_put_disk:
+	put_disk(ns->disk);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index d64805dc8efb..51d957ccf328 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -973,22 +973,11 @@ int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg)
 	}
 }
 
-void nvme_nvm_update_nvm_info(struct nvme_ns *ns)
-{
-	struct nvm_dev *ndev = ns->ndev;
-	struct nvm_geo *geo = &ndev->geo;
-
-	if (geo->version == NVM_OCSSD_SPEC_12)
-		return;
-
-	geo->csecs = 1 << ns->lba_shift;
-	geo->sos = ns->ms;
-}
-
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 {
 	struct request_queue *q = ns->queue;
 	struct nvm_dev *dev;
+	struct nvm_geo *geo;
 
 	_nvme_nvm_check_size();
 
@@ -996,6 +985,11 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 	if (!dev)
 		return -ENOMEM;
 
+	/* Note that csecs and sos will be overridden if it is a 1.2 drive. */
+	geo = &dev->geo;
+	geo->csecs = 1 << ns->lba_shift;
+	geo->sos = ns->ms;
+
 	dev->q = q;
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
 	dev->ops = &nvme_nvm_dev_ops;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 8e0ec365ce8d..e20e737ac10c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -540,13 +540,11 @@ static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 #endif /* CONFIG_NVME_MULTIPATH */
 
 #ifdef CONFIG_NVM
-void nvme_nvm_update_nvm_info(struct nvme_ns *ns);
 int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node);
 void nvme_nvm_unregister(struct nvme_ns *ns);
 extern const struct attribute_group nvme_nvm_attr_group;
 int nvme_nvm_ioctl(struct nvme_ns *ns, unsigned int cmd, unsigned long arg);
 #else
-static inline void nvme_nvm_update_nvm_info(struct nvme_ns *ns) {};
 static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name,
 				    int node)
 {

From 42bd0384d77ef7552954056928018f5cfa91a013 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= <javier@javigon.com>
Date: Tue, 11 Dec 2018 20:16:21 +0100
Subject: [PATCH 271/351] lightnvm: pblk: avoid ref warning on cache creation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current kref implementation around pblk global caches triggers a
false positive on refcount_inc_checked() (when called) as the kref is
initialized to 0. Instead of usint kref_inc() on a 0 reference, which is
in principle correct, use kref_init() to avoid the check. This is also
more explicit about what actually happens on cache creation.

In the process, do a small refactoring to use kref helpers.

Fixes: 1864de94ec9d6 "lightnvm: pblk: stop recreating global caches"
Signed-off-by: Javier González <javier@cnexlabs.com>
Reviewed-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-init.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 0e37104de596..72ad3e70318c 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -347,23 +347,19 @@ fail_destroy_ws:
 
 static int pblk_get_global_caches(void)
 {
-	int ret;
+	int ret = 0;
 
 	mutex_lock(&pblk_caches.mutex);
 
-	if (kref_read(&pblk_caches.kref) > 0) {
-		kref_get(&pblk_caches.kref);
-		mutex_unlock(&pblk_caches.mutex);
-		return 0;
-	}
+	if (kref_get_unless_zero(&pblk_caches.kref))
+		goto out;
 
 	ret = pblk_create_global_caches();
-
 	if (!ret)
-		kref_get(&pblk_caches.kref);
+		kref_init(&pblk_caches.kref);
 
+out:
 	mutex_unlock(&pblk_caches.mutex);
-
 	return ret;
 }
 

From dd439496dfbcfee1eb1e0d14984f98acb2b84c16 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:22 +0100
Subject: [PATCH 272/351] lightnvm: pblk: move lba list to partial read context
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently DMA allocated memory is reused on partial read
for lba_list_mem and lba_list_media arrays. In preparation
for dynamic DMA pool sizes we need to move this arrays
into pblk_pr_ctx structures.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-read.c | 20 +++++---------------
 drivers/lightnvm/pblk.h      |  2 ++
 2 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 9fba614adeeb..19917d3c19b3 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -224,7 +224,6 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
 	unsigned long *read_bitmap = pr_ctx->bitmap;
 	int nr_secs = pr_ctx->orig_nr_secs;
 	int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs);
-	__le64 *lba_list_mem, *lba_list_media;
 	void *src_p, *dst_p;
 	int hole, i;
 
@@ -237,13 +236,9 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
 		rqd->ppa_list[0] = ppa;
 	}
 
-	/* Re-use allocated memory for intermediate lbas */
-	lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
-	lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size);
-
 	for (i = 0; i < nr_secs; i++) {
-		lba_list_media[i] = meta_list[i].lba;
-		meta_list[i].lba = lba_list_mem[i];
+		pr_ctx->lba_list_media[i] = meta_list[i].lba;
+		meta_list[i].lba = pr_ctx->lba_list_mem[i];
 	}
 
 	/* Fill the holes in the original bio */
@@ -255,7 +250,7 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
 		line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
 		kref_put(&line->ref, pblk_line_put);
 
-		meta_list[hole].lba = lba_list_media[i];
+		meta_list[hole].lba = pr_ctx->lba_list_media[i];
 
 		src_bv = new_bio->bi_io_vec[i++];
 		dst_bv = bio->bi_io_vec[bio_init_idx + hole];
@@ -295,13 +290,9 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
 	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
 	struct pblk_pr_ctx *pr_ctx;
 	struct bio *new_bio, *bio = r_ctx->private;
-	__le64 *lba_list_mem;
 	int nr_secs = rqd->nr_ppas;
 	int i;
 
-	/* Re-use allocated memory for intermediate lbas */
-	lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size);
-
 	new_bio = bio_alloc(GFP_KERNEL, nr_holes);
 
 	if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes))
@@ -312,12 +303,12 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
 		goto fail_free_pages;
 	}
 
-	pr_ctx = kmalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
+	pr_ctx = kzalloc(sizeof(struct pblk_pr_ctx), GFP_KERNEL);
 	if (!pr_ctx)
 		goto fail_free_pages;
 
 	for (i = 0; i < nr_secs; i++)
-		lba_list_mem[i] = meta_list[i].lba;
+		pr_ctx->lba_list_mem[i] = meta_list[i].lba;
 
 	new_bio->bi_iter.bi_sector = 0; /* internal bio */
 	bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
@@ -325,7 +316,6 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
 	rqd->bio = new_bio;
 	rqd->nr_ppas = nr_holes;
 
-	pr_ctx->ppa_ptr = NULL;
 	pr_ctx->orig_bio = bio;
 	bitmap_copy(pr_ctx->bitmap, read_bitmap, NVM_MAX_VLBA);
 	pr_ctx->bio_init_idx = bio_init_idx;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index e5b88a25d4d6..0e9d3960ac4c 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -132,6 +132,8 @@ struct pblk_pr_ctx {
 	unsigned int bio_init_idx;
 	void *ppa_ptr;
 	dma_addr_t dma_ppa_list;
+	__le64 lba_list_mem[NVM_MAX_VLBA];
+	__le64 lba_list_media[NVM_MAX_VLBA];
 };
 
 /* Pad context */

From faa79f27f0a46cd6c3ac3de5e7f3e142598217fc Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:23 +0100
Subject: [PATCH 273/351] lightnvm: pblk: add helpers for OOB metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pblk currently assumes that size of OOB metadata on drive is always
equal to size of pblk_sec_meta struct. This commit add helpers which will
allow to handle different sizes of OOB metadata on drive in the future.

After this patch only OOB metadata equal to 16 bytes is supported.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-core.c     |  5 ++--
 drivers/lightnvm/pblk-init.c     |  6 ++++
 drivers/lightnvm/pblk-map.c      | 20 ++++++++-----
 drivers/lightnvm/pblk-read.c     | 48 +++++++++++++++++++++-----------
 drivers/lightnvm/pblk-recovery.c | 16 +++++++----
 drivers/lightnvm/pblk.h          |  6 ++++
 6 files changed, 69 insertions(+), 32 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index f1b411e7c7c9..e732b2d12a23 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -796,10 +796,11 @@ static int pblk_line_smeta_write(struct pblk *pblk, struct pblk_line *line,
 	rqd.is_seq = 1;
 
 	for (i = 0; i < lm->smeta_sec; i++, paddr++) {
-		struct pblk_sec_meta *meta_list = rqd.meta_list;
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk,
+							   rqd.meta_list, i);
 
 		rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id);
-		meta_list[i].lba = lba_list[paddr] = addr_empty;
+		meta->lba = lba_list[paddr] = addr_empty;
 	}
 
 	ret = pblk_submit_io_sync_sem(pblk, &rqd);
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 72ad3e70318c..33361bfb85c3 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -405,6 +405,12 @@ static int pblk_core_init(struct pblk *pblk)
 		queue_max_hw_sectors(dev->q) / (geo->csecs >> SECTOR_SHIFT));
 	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
 
+	pblk->oob_meta_size = geo->sos;
+	if (pblk->oob_meta_size != sizeof(struct pblk_sec_meta)) {
+		pblk_err(pblk, "Unsupported metadata size\n");
+		return -EINVAL;
+	}
+
 	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
 								GFP_KERNEL);
 	if (!pblk->pad_dist)
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 5a3c28cce8ab..81e503ec384e 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -22,7 +22,7 @@
 static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 			      struct ppa_addr *ppa_list,
 			      unsigned long *lun_bitmap,
-			      struct pblk_sec_meta *meta_list,
+			      void *meta_list,
 			      unsigned int valid_secs)
 {
 	struct pblk_line *line = pblk_line_get_data(pblk);
@@ -58,6 +58,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 	paddr = pblk_alloc_page(pblk, line, nr_secs);
 
 	for (i = 0; i < nr_secs; i++, paddr++) {
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
 		__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
 
 		/* ppa to be sent to the device */
@@ -74,14 +75,15 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 			kref_get(&line->ref);
 			w_ctx = pblk_rb_w_ctx(&pblk->rwb, sentry + i);
 			w_ctx->ppa = ppa_list[i];
-			meta_list[i].lba = cpu_to_le64(w_ctx->lba);
+			meta->lba = cpu_to_le64(w_ctx->lba);
 			lba_list[paddr] = cpu_to_le64(w_ctx->lba);
 			if (lba_list[paddr] != addr_empty)
 				line->nr_valid_lbas++;
 			else
 				atomic64_inc(&pblk->pad_wa);
 		} else {
-			lba_list[paddr] = meta_list[i].lba = addr_empty;
+			lba_list[paddr] = addr_empty;
+			meta->lba = addr_empty;
 			__pblk_map_invalidate(pblk, line, paddr);
 		}
 	}
@@ -94,7 +96,8 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 		 unsigned long *lun_bitmap, unsigned int valid_secs,
 		 unsigned int off)
 {
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	void *meta_list = rqd->meta_list;
+	void *meta_buffer;
 	struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
 	unsigned int map_secs;
 	int min = pblk->min_write_pgs;
@@ -103,9 +106,10 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 
 	for (i = off; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+		meta_buffer = pblk_get_meta(pblk, meta_list, i);
 
 		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs);
+					lun_bitmap, meta_buffer, map_secs);
 		if (ret)
 			return ret;
 	}
@@ -121,7 +125,8 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_line_meta *lm = &pblk->lm;
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	void *meta_list = rqd->meta_list;
+	void *meta_buffer;
 	struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
 	struct pblk_line *e_line, *d_line;
 	unsigned int map_secs;
@@ -132,9 +137,10 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
 	for (i = 0; i < rqd->nr_ppas; i += min) {
 		map_secs = (i + min > valid_secs) ? (valid_secs % min) : min;
+		meta_buffer = pblk_get_meta(pblk, meta_list, i);
 
 		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-					lun_bitmap, &meta_list[i], map_secs);
+					lun_bitmap, meta_buffer, map_secs);
 		if (ret)
 			return ret;
 
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 19917d3c19b3..6becd85ca4c6 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -43,7 +43,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
 				 struct bio *bio, sector_t blba,
 				 unsigned long *read_bitmap)
 {
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	void *meta_list = rqd->meta_list;
 	struct ppa_addr ppas[NVM_MAX_VLBA];
 	int nr_secs = rqd->nr_ppas;
 	bool advanced_bio = false;
@@ -53,12 +53,15 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
 	for (i = 0; i < nr_secs; i++) {
 		struct ppa_addr p = ppas[i];
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
 		sector_t lba = blba + i;
 
 retry:
 		if (pblk_ppa_empty(p)) {
+			__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
 			WARN_ON(test_and_set_bit(i, read_bitmap));
-			meta_list[i].lba = cpu_to_le64(ADDR_EMPTY);
+			meta->lba = addr_empty;
 
 			if (unlikely(!advanced_bio)) {
 				bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE);
@@ -78,7 +81,7 @@ retry:
 				goto retry;
 			}
 			WARN_ON(test_and_set_bit(i, read_bitmap));
-			meta_list[i].lba = cpu_to_le64(lba);
+			meta->lba = cpu_to_le64(lba);
 			advanced_bio = true;
 #ifdef CONFIG_NVM_PBLK_DEBUG
 			atomic_long_inc(&pblk->cache_reads);
@@ -105,12 +108,13 @@ next:
 static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
 				sector_t blba)
 {
-	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	void *meta_list = rqd->meta_list;
 	int nr_lbas = rqd->nr_ppas;
 	int i;
 
 	for (i = 0; i < nr_lbas; i++) {
-		u64 lba = le64_to_cpu(meta_lba_list[i].lba);
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
+		u64 lba = le64_to_cpu(meta->lba);
 
 		if (lba == ADDR_EMPTY)
 			continue;
@@ -134,17 +138,19 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
 static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
 				 u64 *lba_list, int nr_lbas)
 {
-	struct pblk_sec_meta *meta_lba_list = rqd->meta_list;
+	void *meta_lba_list = rqd->meta_list;
 	int i, j;
 
 	for (i = 0, j = 0; i < nr_lbas; i++) {
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk,
+							   meta_lba_list, j);
 		u64 lba = lba_list[i];
 		u64 meta_lba;
 
 		if (lba == ADDR_EMPTY)
 			continue;
 
-		meta_lba = le64_to_cpu(meta_lba_list[j].lba);
+		meta_lba = le64_to_cpu(meta->lba);
 
 		if (lba != meta_lba) {
 #ifdef CONFIG_NVM_PBLK_DEBUG
@@ -216,10 +222,11 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
 	struct pblk *pblk = rqd->private;
 	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
 	struct pblk_pr_ctx *pr_ctx = r_ctx->private;
+	struct pblk_sec_meta *meta;
 	struct bio *new_bio = rqd->bio;
 	struct bio *bio = pr_ctx->orig_bio;
 	struct bio_vec src_bv, dst_bv;
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	void *meta_list = rqd->meta_list;
 	int bio_init_idx = pr_ctx->bio_init_idx;
 	unsigned long *read_bitmap = pr_ctx->bitmap;
 	int nr_secs = pr_ctx->orig_nr_secs;
@@ -237,8 +244,9 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
 	}
 
 	for (i = 0; i < nr_secs; i++) {
-		pr_ctx->lba_list_media[i] = meta_list[i].lba;
-		meta_list[i].lba = pr_ctx->lba_list_mem[i];
+		meta = pblk_get_meta(pblk, meta_list, i);
+		pr_ctx->lba_list_media[i] = le64_to_cpu(meta->lba);
+		meta->lba = cpu_to_le64(pr_ctx->lba_list_mem[i]);
 	}
 
 	/* Fill the holes in the original bio */
@@ -250,7 +258,8 @@ static void pblk_end_partial_read(struct nvm_rq *rqd)
 		line = pblk_ppa_to_line(pblk, rqd->ppa_list[i]);
 		kref_put(&line->ref, pblk_line_put);
 
-		meta_list[hole].lba = pr_ctx->lba_list_media[i];
+		meta = pblk_get_meta(pblk, meta_list, hole);
+		meta->lba = cpu_to_le64(pr_ctx->lba_list_media[i]);
 
 		src_bv = new_bio->bi_io_vec[i++];
 		dst_bv = bio->bi_io_vec[bio_init_idx + hole];
@@ -286,7 +295,7 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
 			    unsigned long *read_bitmap,
 			    int nr_holes)
 {
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	void *meta_list = rqd->meta_list;
 	struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
 	struct pblk_pr_ctx *pr_ctx;
 	struct bio *new_bio, *bio = r_ctx->private;
@@ -307,8 +316,11 @@ static int pblk_setup_partial_read(struct pblk *pblk, struct nvm_rq *rqd,
 	if (!pr_ctx)
 		goto fail_free_pages;
 
-	for (i = 0; i < nr_secs; i++)
-		pr_ctx->lba_list_mem[i] = meta_list[i].lba;
+	for (i = 0; i < nr_secs; i++) {
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
+
+		pr_ctx->lba_list_mem[i] = le64_to_cpu(meta->lba);
+	}
 
 	new_bio->bi_iter.bi_sector = 0; /* internal bio */
 	bio_set_op_attrs(new_bio, REQ_OP_READ, 0);
@@ -373,7 +385,7 @@ err:
 static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
 			 sector_t lba, unsigned long *read_bitmap)
 {
-	struct pblk_sec_meta *meta_list = rqd->meta_list;
+	struct pblk_sec_meta *meta = pblk_get_meta(pblk, rqd->meta_list, 0);
 	struct ppa_addr ppa;
 
 	pblk_lookup_l2p_seq(pblk, &ppa, lba, 1);
@@ -384,8 +396,10 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, struct bio *bio,
 
 retry:
 	if (pblk_ppa_empty(ppa)) {
+		__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
+
 		WARN_ON(test_and_set_bit(0, read_bitmap));
-		meta_list[0].lba = cpu_to_le64(ADDR_EMPTY);
+		meta->lba = addr_empty;
 		return;
 	}
 
@@ -399,7 +413,7 @@ retry:
 		}
 
 		WARN_ON(test_and_set_bit(0, read_bitmap));
-		meta_list[0].lba = cpu_to_le64(lba);
+		meta->lba = cpu_to_le64(lba);
 
 #ifdef CONFIG_NVM_PBLK_DEBUG
 		atomic_long_inc(&pblk->cache_reads);
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 4c726506a831..e4dd634ba05f 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -127,7 +127,7 @@ static u64 pblk_sec_in_open_line(struct pblk *pblk, struct pblk_line *line)
 
 struct pblk_recov_alloc {
 	struct ppa_addr *ppa_list;
-	struct pblk_sec_meta *meta_list;
+	void *meta_list;
 	struct nvm_rq *rqd;
 	void *data;
 	dma_addr_t dma_ppa_list;
@@ -161,7 +161,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
-	struct pblk_sec_meta *meta_list;
+	void *meta_list;
 	struct pblk_pad_rq *pad_rq;
 	struct nvm_rq *rqd;
 	struct bio *bio;
@@ -240,12 +240,15 @@ next_pad_rq:
 
 		for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++) {
 			struct ppa_addr dev_ppa;
+			struct pblk_sec_meta *meta;
 			__le64 addr_empty = cpu_to_le64(ADDR_EMPTY);
 
 			dev_ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
 
 			pblk_map_invalidate(pblk, dev_ppa);
-			lba_list[w_ptr] = meta_list[i].lba = addr_empty;
+			lba_list[w_ptr] = addr_empty;
+			meta = pblk_get_meta(pblk, meta_list, i);
+			meta->lba = addr_empty;
 			rqd->ppa_list[i] = dev_ppa;
 		}
 	}
@@ -340,7 +343,7 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct nvm_geo *geo = &dev->geo;
 	struct ppa_addr *ppa_list;
-	struct pblk_sec_meta *meta_list;
+	void *meta_list;
 	struct nvm_rq *rqd;
 	struct bio *bio;
 	void *data;
@@ -438,7 +441,8 @@ retry_rq:
 	}
 
 	for (i = 0; i < rqd->nr_ppas; i++) {
-		u64 lba = le64_to_cpu(meta_list[i].lba);
+		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
+		u64 lba = le64_to_cpu(meta->lba);
 
 		lba_list[paddr++] = cpu_to_le64(lba);
 
@@ -467,7 +471,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
 	struct nvm_geo *geo = &dev->geo;
 	struct nvm_rq *rqd;
 	struct ppa_addr *ppa_list;
-	struct pblk_sec_meta *meta_list;
+	void *meta_list;
 	struct pblk_recov_alloc p;
 	void *data;
 	dma_addr_t dma_ppa_list, dma_meta_list;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 0e9d3960ac4c..80f356688803 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -634,6 +634,7 @@ struct pblk {
 
 	int min_write_pgs; /* Minimum amount of pages required by controller */
 	int max_write_pgs; /* Maximum amount of pages supported by controller */
+	int oob_meta_size; /* Size of OOB sector metadata */
 
 	sector_t capacity; /* Device capacity when bad blocks are subtracted */
 
@@ -1380,6 +1381,11 @@ static inline unsigned int pblk_get_min_chks(struct pblk *pblk)
 	 */
 
 	return DIV_ROUND_UP(100, pblk->op) * lm->blk_per_line;
+}
 
+static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk,
+							 void *meta, int index)
+{
+	return meta + pblk->oob_meta_size * index;
 }
 #endif /* PBLK_H_ */

From 24828d0536bbedc9b265f2b01ffca99de3f6a7c7 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:24 +0100
Subject: [PATCH 274/351] lightnvm: dynamic DMA pool entry size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently lightnvm and pblk uses single DMA pool, for which the entry
size always is equal to PAGE_SIZE. The contents of each entry allocated
from the DMA pool consists of a PPA list (8bytes * 64), leaving
56bytes * 64 space for metadata. Since the metadata field can be bigger,
such as 128 bytes, the static size does not cover this use-case.

This patch adds support for I/O metadata above 56 bytes by changing DMA
pool size based on device meta size and allows pblk to use OOB metadata
>=16B.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/core.c          | 9 +++++++--
 drivers/lightnvm/pblk-core.c     | 8 ++++----
 drivers/lightnvm/pblk-init.c     | 2 +-
 drivers/lightnvm/pblk-recovery.c | 4 ++--
 drivers/lightnvm/pblk.h          | 6 +++++-
 drivers/nvme/host/lightnvm.c     | 5 +++--
 include/linux/lightnvm.h         | 2 +-
 7 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 69b841d682c7..5f82036fe322 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -1140,7 +1140,7 @@ EXPORT_SYMBOL(nvm_alloc_dev);
 
 int nvm_register(struct nvm_dev *dev)
 {
-	int ret;
+	int ret, exp_pool_size;
 
 	if (!dev->q || !dev->ops)
 		return -EINVAL;
@@ -1149,7 +1149,12 @@ int nvm_register(struct nvm_dev *dev)
 	if (ret)
 		return ret;
 
-	dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist");
+	exp_pool_size = max_t(int, PAGE_SIZE,
+			      (NVM_MAX_VLBA * (sizeof(u64) + dev->geo.sos)));
+	exp_pool_size = round_up(exp_pool_size, PAGE_SIZE);
+
+	dev->dma_pool = dev->ops->create_dma_pool(dev, "ppalist",
+						  exp_pool_size);
 	if (!dev->dma_pool) {
 		pr_err("nvm: could not create dma pool\n");
 		nvm_free(dev);
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index e732b2d12a23..7e3397f8ead1 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -250,8 +250,8 @@ int pblk_alloc_rqd_meta(struct pblk *pblk, struct nvm_rq *rqd)
 	if (rqd->nr_ppas == 1)
 		return 0;
 
-	rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size;
-	rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size;
+	rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size(pblk);
+	rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size(pblk);
 
 	return 0;
 }
@@ -846,8 +846,8 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
 	if (!meta_list)
 		return -ENOMEM;
 
-	ppa_list = meta_list + pblk_dma_meta_size;
-	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+	ppa_list = meta_list + pblk_dma_meta_size(pblk);
+	dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
 
 next_rq:
 	memset(&rqd, 0, sizeof(struct nvm_rq));
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 33361bfb85c3..ff6a6df369c3 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -406,7 +406,7 @@ static int pblk_core_init(struct pblk *pblk)
 	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
 
 	pblk->oob_meta_size = geo->sos;
-	if (pblk->oob_meta_size != sizeof(struct pblk_sec_meta)) {
+	if (pblk->oob_meta_size < sizeof(struct pblk_sec_meta)) {
 		pblk_err(pblk, "Unsupported metadata size\n");
 		return -EINVAL;
 	}
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index e4dd634ba05f..3a775d10f616 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -481,8 +481,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line)
 	if (!meta_list)
 		return -ENOMEM;
 
-	ppa_list = (void *)(meta_list) + pblk_dma_meta_size;
-	dma_ppa_list = dma_meta_list + pblk_dma_meta_size;
+	ppa_list = (void *)(meta_list) + pblk_dma_meta_size(pblk);
+	dma_ppa_list = dma_meta_list + pblk_dma_meta_size(pblk);
 
 	data = kcalloc(pblk->max_write_pgs, geo->csecs, GFP_KERNEL);
 	if (!data) {
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 80f356688803..9087d53d5c25 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -104,7 +104,6 @@ enum {
 	PBLK_RL_LOW = 4
 };
 
-#define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * NVM_MAX_VLBA)
 #define pblk_dma_ppa_size (sizeof(u64) * NVM_MAX_VLBA)
 
 /* write buffer completion context */
@@ -1388,4 +1387,9 @@ static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk,
 {
 	return meta + pblk->oob_meta_size * index;
 }
+
+static inline int pblk_dma_meta_size(struct pblk *pblk)
+{
+	return pblk->oob_meta_size * NVM_MAX_VLBA;
+}
 #endif /* PBLK_H_ */
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 51d957ccf328..ba268d7cf141 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -732,11 +732,12 @@ static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd)
 	return ret;
 }
 
-static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name)
+static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name,
+					int size)
 {
 	struct nvme_ns *ns = nvmdev->q->queuedata;
 
-	return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0);
+	return dma_pool_create(name, ns->ctrl->dev, size, PAGE_SIZE, 0);
 }
 
 static void nvme_nvm_destroy_dma_pool(void *pool)
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 2fdeac1a420d..7afedaddbd15 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -90,7 +90,7 @@ typedef int (nvm_get_chk_meta_fn)(struct nvm_dev *, sector_t, int,
 							struct nvm_chk_meta *);
 typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *);
 typedef int (nvm_submit_io_sync_fn)(struct nvm_dev *, struct nvm_rq *);
-typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *);
+typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *, int);
 typedef void (nvm_destroy_dma_pool_fn)(void *);
 typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t,
 								dma_addr_t *);

From a16816b9e462e8ee86a908606bde54b53cfeca80 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:25 +0100
Subject: [PATCH 275/351] lightnvm: disable interleaved metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently pblk only check the size of I/O metadata and does not take
into account if this metadata is in a separate buffer or interleaved
in a single metadata buffer.

In reality only the first scenario is supported, where second mode will
break pblk functionality during any IO operation.

This patch prevents pblk to be instantiated in case device only
supports interleaved metadata.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-init.c | 6 ++++++
 drivers/nvme/host/lightnvm.c | 1 +
 include/linux/lightnvm.h     | 1 +
 3 files changed, 8 insertions(+)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index ff6a6df369c3..e8055b796381 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -1175,6 +1175,12 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (geo->ext) {
+		pblk_err(pblk, "extended metadata not supported\n");
+		kfree(pblk);
+		return ERR_PTR(-EINVAL);
+	}
+
 	spin_lock_init(&pblk->resubmit_lock);
 	spin_lock_init(&pblk->trans_lock);
 	spin_lock_init(&pblk->lock);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ba268d7cf141..f145fc0220d6 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -990,6 +990,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 	geo = &dev->geo;
 	geo->csecs = 1 << ns->lba_shift;
 	geo->sos = ns->ms;
+	geo->ext = ns->ext;
 
 	dev->q = q;
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 7afedaddbd15..5d865a5d5cdc 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -357,6 +357,7 @@ struct nvm_geo {
 	u32	clba;		/* sectors per chunk */
 	u16	csecs;		/* sector size */
 	u16	sos;		/* out-of-band area size */
+	bool	ext;		/* metadata in extended data buffer */
 
 	/* device write constrains */
 	u32	ws_min;		/* minimum write size */

From 55d8ec35398e7ab001989473cf6ed6f40b5ef4a6 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:26 +0100
Subject: [PATCH 276/351] lightnvm: pblk: support packed metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pblk performs recovery of open lines by storing the LBA in the per LBA
metadata field. Recovery therefore only works for drives that has this
field.

This patch adds support for packed metadata, which store l2p mapping
for open lines in last sector of every write unit and enables drives
without per IO metadata to recover open lines.

After this patch, drives with OOB size <16B will use packed metadata
and metadata size larger than16B will continue to use the device per
IO metadata.

Reviewed-by: Javier González <javier@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk-core.c     | 48 +++++++++++++++++++++++++++++---
 drivers/lightnvm/pblk-init.c     | 38 +++++++++++++++++++++----
 drivers/lightnvm/pblk-map.c      |  4 +--
 drivers/lightnvm/pblk-rb.c       |  3 ++
 drivers/lightnvm/pblk-read.c     |  6 ++++
 drivers/lightnvm/pblk-recovery.c | 17 ++++++++---
 drivers/lightnvm/pblk-sysfs.c    |  7 +++++
 drivers/lightnvm/pblk-write.c    |  9 +++---
 drivers/lightnvm/pblk.h          | 10 ++++++-
 9 files changed, 122 insertions(+), 20 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 7e3397f8ead1..1ff165351180 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -376,7 +376,7 @@ void pblk_write_should_kick(struct pblk *pblk)
 {
 	unsigned int secs_avail = pblk_rb_read_count(&pblk->rwb);
 
-	if (secs_avail >= pblk->min_write_pgs)
+	if (secs_avail >= pblk->min_write_pgs_data)
 		pblk_write_kick(pblk);
 }
 
@@ -407,7 +407,9 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line)
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct list_head *move_list = NULL;
-	int vsc = le32_to_cpu(*line->vsc);
+	int packed_meta = (le32_to_cpu(*line->vsc) / pblk->min_write_pgs_data)
+			* (pblk->min_write_pgs - pblk->min_write_pgs_data);
+	int vsc = le32_to_cpu(*line->vsc) + packed_meta;
 
 	lockdep_assert_held(&line->lock);
 
@@ -620,12 +622,15 @@ out:
 }
 
 int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
-		   unsigned long secs_to_flush)
+		   unsigned long secs_to_flush, bool skip_meta)
 {
 	int max = pblk->sec_per_write;
 	int min = pblk->min_write_pgs;
 	int secs_to_sync = 0;
 
+	if (skip_meta && pblk->min_write_pgs_data != pblk->min_write_pgs)
+		min = max = pblk->min_write_pgs_data;
+
 	if (secs_avail >= max)
 		secs_to_sync = max;
 	else if (secs_avail >= min)
@@ -852,7 +857,7 @@ int pblk_line_emeta_read(struct pblk *pblk, struct pblk_line *line,
 next_rq:
 	memset(&rqd, 0, sizeof(struct nvm_rq));
 
-	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
 	rq_len = rq_ppas * geo->csecs;
 
 	bio = pblk_bio_map_addr(pblk, emeta_buf, rq_ppas, rq_len,
@@ -2169,3 +2174,38 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
 	}
 	spin_unlock(&pblk->trans_lock);
 }
+
+void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	void *buffer;
+
+	if (pblk_is_oob_meta_supported(pblk)) {
+		/* Just use OOB metadata buffer as always */
+		buffer = rqd->meta_list;
+	} else {
+		/* We need to reuse last page of request (packed metadata)
+		 * in similar way as traditional oob metadata
+		 */
+		buffer = page_to_virt(
+			rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
+	}
+
+	return buffer;
+}
+
+void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd)
+{
+	void *meta_list = rqd->meta_list;
+	void *page;
+	int i = 0;
+
+	if (pblk_is_oob_meta_supported(pblk))
+		return;
+
+	page = page_to_virt(rqd->bio->bi_io_vec[rqd->bio->bi_vcnt - 1].bv_page);
+	/* We need to fill oob meta buffer with data from packed metadata */
+	for (; i < rqd->nr_ppas; i++)
+		memcpy(pblk_get_meta(pblk, meta_list, i),
+			page + (i * sizeof(struct pblk_sec_meta)),
+			sizeof(struct pblk_sec_meta));
+}
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index e8055b796381..f9a3e47b6a93 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -399,6 +399,7 @@ static int pblk_core_init(struct pblk *pblk)
 	pblk->nr_flush_rst = 0;
 
 	pblk->min_write_pgs = geo->ws_opt;
+	pblk->min_write_pgs_data = pblk->min_write_pgs;
 	max_write_ppas = pblk->min_write_pgs * geo->all_luns;
 	pblk->max_write_pgs = min_t(int, max_write_ppas, NVM_MAX_VLBA);
 	pblk->max_write_pgs = min_t(int, pblk->max_write_pgs,
@@ -406,9 +407,35 @@ static int pblk_core_init(struct pblk *pblk)
 	pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
 
 	pblk->oob_meta_size = geo->sos;
-	if (pblk->oob_meta_size < sizeof(struct pblk_sec_meta)) {
-		pblk_err(pblk, "Unsupported metadata size\n");
-		return -EINVAL;
+	if (!pblk_is_oob_meta_supported(pblk)) {
+		/* For drives which does not have OOB metadata feature
+		 * in order to support recovery feature we need to use
+		 * so called packed metadata. Packed metada will store
+		 * the same information as OOB metadata (l2p table mapping,
+		 * but in the form of the single page at the end of
+		 * every write request.
+		 */
+		if (pblk->min_write_pgs
+			* sizeof(struct pblk_sec_meta) > PAGE_SIZE) {
+			/* We want to keep all the packed metadata on single
+			 * page per write requests. So we need to ensure that
+			 * it will fit.
+			 *
+			 * This is more like sanity check, since there is
+			 * no device with such a big minimal write size
+			 * (above 1 metabytes).
+			 */
+			pblk_err(pblk, "Not supported min write size\n");
+			return -EINVAL;
+		}
+		/* For packed meta approach we do some simplification.
+		 * On read path we always issue requests which size
+		 * equal to max_write_pgs, with all pages filled with
+		 * user payload except of last one page which will be
+		 * filled with packed metadata.
+		 */
+		pblk->max_write_pgs = pblk->min_write_pgs;
+		pblk->min_write_pgs_data = pblk->min_write_pgs - 1;
 	}
 
 	pblk->pad_dist = kcalloc(pblk->min_write_pgs - 1, sizeof(atomic64_t),
@@ -641,7 +668,7 @@ static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct nvm_geo *geo = &dev->geo;
 	sector_t provisioned;
-	int sec_meta, blk_meta;
+	int sec_meta, blk_meta, clba;
 	int minimum;
 
 	if (geo->op == NVM_TARGET_DEFAULT_OP)
@@ -682,7 +709,8 @@ static int pblk_set_provision(struct pblk *pblk, int nr_free_chks)
 	sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
 	blk_meta = DIV_ROUND_UP(sec_meta, geo->clba);
 
-	pblk->capacity = (provisioned - blk_meta) * geo->clba;
+	clba = (geo->clba / pblk->min_write_pgs) * pblk->min_write_pgs_data;
+	pblk->capacity = (provisioned - blk_meta) * clba;
 
 	atomic_set(&pblk->rl.free_blocks, nr_free_chks);
 	atomic_set(&pblk->rl.free_user_blocks, nr_free_chks);
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 81e503ec384e..79df583ea709 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -96,7 +96,7 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 		 unsigned long *lun_bitmap, unsigned int valid_secs,
 		 unsigned int off)
 {
-	void *meta_list = rqd->meta_list;
+	void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
 	void *meta_buffer;
 	struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
 	unsigned int map_secs;
@@ -125,7 +125,7 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_line_meta *lm = &pblk->lm;
-	void *meta_list = rqd->meta_list;
+	void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
 	void *meta_buffer;
 	struct ppa_addr *ppa_list = nvm_rq_to_ppa_list(rqd);
 	struct pblk_line *e_line, *d_line;
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 9f7fa0fe9c77..d4ca8c64ee0f 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -552,6 +552,9 @@ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd,
 		to_read = count;
 	}
 
+	/* Add space for packed metadata if in use*/
+	pad += (pblk->min_write_pgs - pblk->min_write_pgs_data);
+
 	c_ctx->sentry = pos;
 	c_ctx->nr_valid = to_read;
 	c_ctx->nr_padded = pad;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index 6becd85ca4c6..3789185144da 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -112,6 +112,9 @@ static void pblk_read_check_seq(struct pblk *pblk, struct nvm_rq *rqd,
 	int nr_lbas = rqd->nr_ppas;
 	int i;
 
+	if (!pblk_is_oob_meta_supported(pblk))
+		return;
+
 	for (i = 0; i < nr_lbas; i++) {
 		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
 		u64 lba = le64_to_cpu(meta->lba);
@@ -141,6 +144,9 @@ static void pblk_read_check_rand(struct pblk *pblk, struct nvm_rq *rqd,
 	void *meta_lba_list = rqd->meta_list;
 	int i, j;
 
+	if (!pblk_is_oob_meta_supported(pblk))
+		return;
+
 	for (i = 0, j = 0; i < nr_lbas; i++) {
 		struct pblk_sec_meta *meta = pblk_get_meta(pblk,
 							   meta_lba_list, j);
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 3a775d10f616..3fcf062d752c 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -191,7 +191,7 @@ static int pblk_recov_pad_line(struct pblk *pblk, struct pblk_line *line,
 	kref_init(&pad_rq->ref);
 
 next_pad_rq:
-	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
 	if (rq_ppas < pblk->min_write_pgs) {
 		pblk_err(pblk, "corrupted pad line %d\n", line->id);
 		goto fail_free_pad;
@@ -371,17 +371,19 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line,
 next_rq:
 	memset(rqd, 0, pblk_g_rq_size);
 
-	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0);
+	rq_ppas = pblk_calc_secs(pblk, left_ppas, 0, false);
 	if (!rq_ppas)
 		rq_ppas = pblk->min_write_pgs;
 	rq_len = rq_ppas * geo->csecs;
 
+retry_rq:
 	bio = bio_map_kern(dev->q, data, rq_len, GFP_KERNEL);
 	if (IS_ERR(bio))
 		return PTR_ERR(bio);
 
 	bio->bi_iter.bi_sector = 0; /* internal bio */
 	bio_set_op_attrs(bio, REQ_OP_READ, 0);
+	bio_get(bio);
 
 	rqd->bio = bio;
 	rqd->opcode = NVM_OP_PREAD;
@@ -394,7 +396,6 @@ next_rq:
 	if (pblk_io_aligned(pblk, rq_ppas))
 		rqd->is_seq = 1;
 
-retry_rq:
 	for (i = 0; i < rqd->nr_ppas; ) {
 		struct ppa_addr ppa;
 		int pos;
@@ -417,6 +418,7 @@ retry_rq:
 	if (ret) {
 		pblk_err(pblk, "I/O submission failed: %d\n", ret);
 		bio_put(bio);
+		bio_put(bio);
 		return ret;
 	}
 
@@ -428,18 +430,25 @@ retry_rq:
 
 		if (padded) {
 			pblk_log_read_err(pblk, rqd);
+			bio_put(bio);
 			return -EINTR;
 		}
 
 		pad_distance = pblk_pad_distance(pblk, line);
 		ret = pblk_recov_pad_line(pblk, line, pad_distance);
-		if (ret)
+		if (ret) {
+			bio_put(bio);
 			return ret;
+		}
 
 		padded = true;
+		bio_put(bio);
 		goto retry_rq;
 	}
 
+	pblk_get_packed_meta(pblk, rqd);
+	bio_put(bio);
+
 	for (i = 0; i < rqd->nr_ppas; i++) {
 		struct pblk_sec_meta *meta = pblk_get_meta(pblk, meta_list, i);
 		u64 lba = le64_to_cpu(meta->lba);
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index 2d2818155aa8..7d8958df9472 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -479,6 +479,13 @@ static ssize_t pblk_sysfs_set_sec_per_write(struct pblk *pblk,
 	if (kstrtouint(page, 0, &sec_per_write))
 		return -EINVAL;
 
+	if (!pblk_is_oob_meta_supported(pblk)) {
+		/* For packed metadata case it is
+		 * not allowed to change sec_per_write.
+		 */
+		return -EINVAL;
+	}
+
 	if (sec_per_write < pblk->min_write_pgs
 				|| sec_per_write > pblk->max_write_pgs
 				|| sec_per_write % pblk->min_write_pgs != 0)
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 2bf78f81862d..06d56deb645d 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -348,7 +348,7 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail,
 {
 	int secs_to_sync;
 
-	secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush);
+	secs_to_sync = pblk_calc_secs(pblk, secs_avail, secs_to_flush, true);
 
 #ifdef CONFIG_NVM_PBLK_DEBUG
 	if ((!secs_to_sync && secs_to_flush)
@@ -569,7 +569,7 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
 	struct bio *bio;
 	struct nvm_rq *rqd;
 	unsigned int secs_avail, secs_to_sync, secs_to_com;
-	unsigned int secs_to_flush;
+	unsigned int secs_to_flush, packed_meta_pgs;
 	unsigned long pos;
 	unsigned int resubmit;
 
@@ -607,7 +607,7 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
 			return 0;
 
 		secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
-		if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
+		if (!secs_to_flush && secs_avail < pblk->min_write_pgs_data)
 			return 0;
 
 		secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail,
@@ -622,7 +622,8 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
 		pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com);
 	}
 
-	bio = bio_alloc(GFP_KERNEL, secs_to_sync);
+	packed_meta_pgs = (pblk->min_write_pgs - pblk->min_write_pgs_data);
+	bio = bio_alloc(GFP_KERNEL, secs_to_sync + packed_meta_pgs);
 
 	bio->bi_iter.bi_sector = 0; /* internal bio */
 	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 9087d53d5c25..bc40b1381ff6 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -632,6 +632,7 @@ struct pblk {
 	int state;			/* pblk line state */
 
 	int min_write_pgs; /* Minimum amount of pages required by controller */
+	int min_write_pgs_data; /* Minimum amount of payload pages */
 	int max_write_pgs; /* Maximum amount of pages supported by controller */
 	int oob_meta_size; /* Size of OOB sector metadata */
 
@@ -838,7 +839,7 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
 u64 pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
 u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs);
 int pblk_calc_secs(struct pblk *pblk, unsigned long secs_avail,
-		   unsigned long secs_to_flush);
+		   unsigned long secs_to_flush, bool skip_meta);
 void pblk_down_rq(struct pblk *pblk, struct ppa_addr ppa,
 		  unsigned long *lun_bitmap);
 void pblk_down_chunk(struct pblk *pblk, struct ppa_addr ppa);
@@ -862,6 +863,8 @@ void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas,
 			  u64 *lba_list, int nr_secs);
 void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
 			 sector_t blba, int nr_secs);
+void *pblk_get_meta_for_writes(struct pblk *pblk, struct nvm_rq *rqd);
+void pblk_get_packed_meta(struct pblk *pblk, struct nvm_rq *rqd);
 
 /*
  * pblk user I/O write path
@@ -1392,4 +1395,9 @@ static inline int pblk_dma_meta_size(struct pblk *pblk)
 {
 	return pblk->oob_meta_size * NVM_MAX_VLBA;
 }
+
+static inline int pblk_is_oob_meta_supported(struct pblk *pblk)
+{
+	return pblk->oob_meta_size >= sizeof(struct pblk_sec_meta);
+}
 #endif /* PBLK_H_ */

From 2c4d5356e64d7d538f24c23045478330fae4a065 Mon Sep 17 00:00:00 2001
From: Igor Konopko <igor.j.konopko@intel.com>
Date: Tue, 11 Dec 2018 20:16:27 +0100
Subject: [PATCH 277/351] lightnvm: pblk: do not overwrite ppa list with meta
 list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ehen using pblk with 0 sized metadata both ppa list and meta list
points to the same memory since pblk_dma_meta_size() returns 0 in
that case.

This patch fix that issue by ensuring that pblk_dma_meta_size()
always returns space equal to sizeof(struct pblk_sec_meta) and thus
ppa list and meta list points to different memory address.

Even that in that case drive does not really care about meta_list
pointer, this is the easiest way to fix that issue without introducing
changes in many places in the code just for 0 sized metadata case.

The same approach needs to be also done for pblk_get_sec_meta()
since we also cannot point to the same memory address in meta buffer
when we are using it for pblk recovery process

Reported-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Tested-by: Hans Holmberg <hans.holmberg@cnexlabs.com>
Signed-off-by: Igor Konopko <igor.j.konopko@intel.com>
Signed-off-by: Matias Bjørling <mb@lightnvm.io>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/lightnvm/pblk.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index bc40b1381ff6..85e38ed62f85 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -1388,12 +1388,15 @@ static inline unsigned int pblk_get_min_chks(struct pblk *pblk)
 static inline struct pblk_sec_meta *pblk_get_meta(struct pblk *pblk,
 							 void *meta, int index)
 {
-	return meta + pblk->oob_meta_size * index;
+	return meta +
+	       max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
+	       * index;
 }
 
 static inline int pblk_dma_meta_size(struct pblk *pblk)
 {
-	return pblk->oob_meta_size * NVM_MAX_VLBA;
+	return max_t(int, sizeof(struct pblk_sec_meta), pblk->oob_meta_size)
+	       * NVM_MAX_VLBA;
 }
 
 static inline int pblk_is_oob_meta_supported(struct pblk *pblk)

From b2dbff1bb893d5dfdf501231ff5505ca10cdede3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 11 Dec 2018 18:39:41 -0700
Subject: [PATCH 278/351] sbitmap: flush deferred clears for resize and shallow
 gets

We're missing a deferred clear off the shallow get, which can cause
a hang. Additionally, when we resize the sbitmap, we should also
flush deferred clears for good measure.

Ensure we have full coverage on batch clears, even for paths where
we would not be doing deferred clear. This makes it less error
prone for future additions.

Reported-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 lib/sbitmap.c | 94 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 51 insertions(+), 43 deletions(-)

diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 2261136ae067..5b3e56d68dab 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -20,6 +20,47 @@
 #include <linux/sbitmap.h>
 #include <linux/seq_file.h>
 
+/*
+ * See if we have deferred clears that we can batch move
+ */
+static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
+{
+	unsigned long mask, val;
+	unsigned long __maybe_unused flags;
+	bool ret = false;
+
+	/* Silence bogus lockdep warning */
+#if defined(CONFIG_LOCKDEP)
+	local_irq_save(flags);
+#endif
+	spin_lock(&sb->map[index].swap_lock);
+
+	if (!sb->map[index].cleared)
+		goto out_unlock;
+
+	/*
+	 * First get a stable cleared mask, setting the old mask to 0.
+	 */
+	do {
+		mask = sb->map[index].cleared;
+	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
+
+	/*
+	 * Now clear the masked bits in our free word
+	 */
+	do {
+		val = sb->map[index].word;
+	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
+
+	ret = true;
+out_unlock:
+	spin_unlock(&sb->map[index].swap_lock);
+#if defined(CONFIG_LOCKDEP)
+	local_irq_restore(flags);
+#endif
+	return ret;
+}
+
 int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
 		      gfp_t flags, int node)
 {
@@ -70,6 +111,9 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth)
 	unsigned int bits_per_word = 1U << sb->shift;
 	unsigned int i;
 
+	for (i = 0; i < sb->map_nr; i++)
+		sbitmap_deferred_clear(sb, i);
+
 	sb->depth = depth;
 	sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word);
 
@@ -112,47 +156,6 @@ static int __sbitmap_get_word(unsigned long *word, unsigned long depth,
 	return nr;
 }
 
-/*
- * See if we have deferred clears that we can batch move
- */
-static inline bool sbitmap_deferred_clear(struct sbitmap *sb, int index)
-{
-	unsigned long mask, val;
-	unsigned long __maybe_unused flags;
-	bool ret = false;
-
-	/* Silence bogus lockdep warning */
-#if defined(CONFIG_LOCKDEP)
-	local_irq_save(flags);
-#endif
-	spin_lock(&sb->map[index].swap_lock);
-
-	if (!sb->map[index].cleared)
-		goto out_unlock;
-
-	/*
-	 * First get a stable cleared mask, setting the old mask to 0.
-	 */
-	do {
-		mask = sb->map[index].cleared;
-	} while (cmpxchg(&sb->map[index].cleared, mask, 0) != mask);
-
-	/*
-	 * Now clear the masked bits in our free word
-	 */
-	do {
-		val = sb->map[index].word;
-	} while (cmpxchg(&sb->map[index].word, val, val & ~mask) != val);
-
-	ret = true;
-out_unlock:
-	spin_unlock(&sb->map[index].swap_lock);
-#if defined(CONFIG_LOCKDEP)
-	local_irq_restore(flags);
-#endif
-	return ret;
-}
-
 static int sbitmap_find_bit_in_index(struct sbitmap *sb, int index,
 				     unsigned int alloc_hint, bool round_robin)
 {
@@ -215,6 +218,7 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
 	index = SB_NR_TO_INDEX(sb, alloc_hint);
 
 	for (i = 0; i < sb->map_nr; i++) {
+again:
 		nr = __sbitmap_get_word(&sb->map[index].word,
 					min(sb->map[index].depth, shallow_depth),
 					SB_NR_TO_BIT(sb, alloc_hint), true);
@@ -223,6 +227,9 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
 			break;
 		}
 
+		if (sbitmap_deferred_clear(sb, index))
+			goto again;
+
 		/* Jump to next index. */
 		index++;
 		alloc_hint = index << sb->shift;
@@ -242,7 +249,7 @@ bool sbitmap_any_bit_set(const struct sbitmap *sb)
 	unsigned int i;
 
 	for (i = 0; i < sb->map_nr; i++) {
-		if (sb->map[i].word)
+		if (sb->map[i].word & ~sb->map[i].cleared)
 			return true;
 	}
 	return false;
@@ -255,9 +262,10 @@ bool sbitmap_any_bit_clear(const struct sbitmap *sb)
 
 	for (i = 0; i < sb->map_nr; i++) {
 		const struct sbitmap_word *word = &sb->map[i];
+		unsigned long mask = word->word & ~word->cleared;
 		unsigned long ret;
 
-		ret = find_first_zero_bit(&word->word, word->depth);
+		ret = find_first_zero_bit(&mask, word->depth);
 		if (ret < word->depth)
 			return true;
 	}

From 544fbd16a461a318cd80537d1331c0df5c6cf930 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Wed, 12 Dec 2018 19:44:34 +0800
Subject: [PATCH 279/351] block: deactivate blk_stat timer in
 wbt_disable_default()

rwb_enabled() can't be changed when there is any inflight IO.

wbt_disable_default() may set rwb->wb_normal as zero, however the
blk_stat timer may still be pending, and the timer function will update
wrb->wb_normal again.

This patch introduces blk_stat_deactivate() and applies it in
wbt_disable_default(), then the following IO hang triggered when running
parted & switching io scheduler can be fixed:

[  369.937806] INFO: task parted:3645 blocked for more than 120 seconds.
[  369.938941]       Not tainted 4.20.0-rc6-00284-g906c801e5248 #498
[  369.939797] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  369.940768] parted          D    0  3645   3239 0x00000000
[  369.941500] Call Trace:
[  369.941874]  ? __schedule+0x6d9/0x74c
[  369.942392]  ? wbt_done+0x5e/0x5e
[  369.942864]  ? wbt_cleanup_cb+0x16/0x16
[  369.943404]  ? wbt_done+0x5e/0x5e
[  369.943874]  schedule+0x67/0x78
[  369.944298]  io_schedule+0x12/0x33
[  369.944771]  rq_qos_wait+0xb5/0x119
[  369.945193]  ? karma_partition+0x1c2/0x1c2
[  369.945691]  ? wbt_cleanup_cb+0x16/0x16
[  369.946151]  wbt_wait+0x85/0xb6
[  369.946540]  __rq_qos_throttle+0x23/0x2f
[  369.947014]  blk_mq_make_request+0xe6/0x40a
[  369.947518]  generic_make_request+0x192/0x2fe
[  369.948042]  ? submit_bio+0x103/0x11f
[  369.948486]  ? __radix_tree_lookup+0x35/0xb5
[  369.949011]  submit_bio+0x103/0x11f
[  369.949436]  ? blkg_lookup_slowpath+0x25/0x44
[  369.949962]  submit_bio_wait+0x53/0x7f
[  369.950469]  blkdev_issue_flush+0x8a/0xae
[  369.951032]  blkdev_fsync+0x2f/0x3a
[  369.951502]  do_fsync+0x2e/0x47
[  369.951887]  __x64_sys_fsync+0x10/0x13
[  369.952374]  do_syscall_64+0x89/0x149
[  369.952819]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[  369.953492] RIP: 0033:0x7f95a1e729d4
[  369.953996] Code: Bad RIP value.
[  369.954456] RSP: 002b:00007ffdb570dd48 EFLAGS: 00000246 ORIG_RAX: 000000000000004a
[  369.955506] RAX: ffffffffffffffda RBX: 000055c2139c6be0 RCX: 00007f95a1e729d4
[  369.956389] RDX: 0000000000000001 RSI: 0000000000001261 RDI: 0000000000000004
[  369.957325] RBP: 0000000000000002 R08: 0000000000000000 R09: 000055c2139c6ce0
[  369.958199] R10: 0000000000000000 R11: 0000000000000246 R12: 000055c2139c0380
[  369.959143] R13: 0000000000000004 R14: 0000000000000100 R15: 0000000000000008

Cc: stable@vger.kernel.org
Cc: Paolo Valente <paolo.valente@linaro.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-stat.h | 5 +++++
 block/blk-wbt.c  | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/block/blk-stat.h b/block/blk-stat.h
index f4a1568e81a4..17b47a86eefb 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -145,6 +145,11 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb,
 	mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs));
 }
 
+static inline void blk_stat_deactivate(struct blk_stat_callback *cb)
+{
+	del_timer_sync(&cb->timer);
+}
+
 /**
  * blk_stat_activate_msecs() - Gather block statistics during a time window in
  * milliseconds.
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 40207edd1d89..463e4eb80287 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -708,8 +708,10 @@ void wbt_disable_default(struct request_queue *q)
 	if (!rqos)
 		return;
 	rwb = RQWB(rqos);
-	if (rwb->enable_state == WBT_STATE_ON_DEFAULT)
+	if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
+		blk_stat_deactivate(rwb->cb);
 		rwb->wb_normal = 0;
+	}
 }
 EXPORT_SYMBOL_GPL(wbt_disable_default);
 

From 0273ac349f08f4ff9ef88aaaf9c9f2aa6e87d2be Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Tue, 11 Dec 2018 18:03:08 -0500
Subject: [PATCH 280/351] blkcg: handle dying request_queue when associating a
 blkg

Between v3 [1] and v4 [2] of the blkg association series, the
association point moved from generic_make_request_checks(), which is
called after the request enters the queue, to bio_set_dev(), which is when
the bio is formed before submit_bio(). When the request_queue goes away,
the blkgs supporting the request_queue are destroyed and then the
q->root_blkg is set to %NULL.

This patch adds a %NULL check to blkg_tryget_closest() to prevent the
NPE caused by the above. It also adds a guard to see if the
request_queue is dying when creating a blkg to prevent creating a blkg
for a dead request_queue.

[1] https://lore.kernel.org/lkml/20180911184137.35897-1-dennisszhou@gmail.com/
[2] https://lore.kernel.org/lkml/20181126211946.77067-1-dennis@kernel.org/

Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device")
Reported-and-tested-by: Ming Lei <ming.lei@redhat.com>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-cgroup.c         | 6 ++++++
 include/linux/blk-cgroup.h | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 6bd0619a7d6e..c30661ddc873 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -202,6 +202,12 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 	WARN_ON_ONCE(!rcu_read_lock_held());
 	lockdep_assert_held(&q->queue_lock);
 
+	/* request_queue is dying, do not create/recreate a blkg */
+	if (blk_queue_dying(q)) {
+		ret = -ENODEV;
+		goto err_free_blkg;
+	}
+
 	/* blkg holds a reference to blkcg */
 	if (!css_tryget_online(&blkcg->css)) {
 		ret = -ENODEV;
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index bf13ecb0fe4f..f025fd1e22e6 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -511,7 +511,7 @@ static inline bool blkg_tryget(struct blkcg_gq *blkg)
  */
 static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg)
 {
-	while (!percpu_ref_tryget(&blkg->refcnt))
+	while (blkg && !percpu_ref_tryget(&blkg->refcnt))
 		blkg = blkg->parent;
 
 	return blkg;

From 3152a974678a1e80c3c16d4b86522ecc500be529 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:05 -0800
Subject: [PATCH 281/351] ath6kl: add ath6kl_ prefix to crypto_type

Prevent a namespace conflict as in following patches as skbuff.h will
include the crypto API.

Acked-by: David S. Miller <davem@davemloft.net>
Cc: Kalle Valo <kvalo@codeaurora.org>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/net/wireless/ath/ath6kl/cfg80211.c | 2 +-
 drivers/net/wireless/ath/ath6kl/common.h   | 2 +-
 drivers/net/wireless/ath/ath6kl/wmi.c      | 6 +++---
 drivers/net/wireless/ath/ath6kl/wmi.h      | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c
index e121187f371f..fa049c4ae315 100644
--- a/drivers/net/wireless/ath/ath6kl/cfg80211.c
+++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c
@@ -1322,7 +1322,7 @@ static int ath6kl_cfg80211_set_default_key(struct wiphy *wiphy,
 	struct ath6kl_vif *vif = netdev_priv(ndev);
 	struct ath6kl_key *key = NULL;
 	u8 key_usage;
-	enum crypto_type key_type = NONE_CRYPT;
+	enum ath6kl_crypto_type key_type = NONE_CRYPT;
 
 	ath6kl_dbg(ATH6KL_DBG_WLAN_CFG, "%s: index %d\n", __func__, key_index);
 
diff --git a/drivers/net/wireless/ath/ath6kl/common.h b/drivers/net/wireless/ath/ath6kl/common.h
index 4f82e8632d37..d6e5234f67a1 100644
--- a/drivers/net/wireless/ath/ath6kl/common.h
+++ b/drivers/net/wireless/ath/ath6kl/common.h
@@ -67,7 +67,7 @@ struct ath6kl_llc_snap_hdr {
 	__be16 eth_type;
 } __packed;
 
-enum crypto_type {
+enum ath6kl_crypto_type {
 	NONE_CRYPT          = 0x01,
 	WEP_CRYPT           = 0x02,
 	TKIP_CRYPT          = 0x04,
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.c b/drivers/net/wireless/ath/ath6kl/wmi.c
index 777acc564ac9..9d7ac1ab2d02 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.c
+++ b/drivers/net/wireless/ath/ath6kl/wmi.c
@@ -1849,9 +1849,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx,
 			   enum network_type nw_type,
 			   enum dot11_auth_mode dot11_auth_mode,
 			   enum auth_mode auth_mode,
-			   enum crypto_type pairwise_crypto,
+			   enum ath6kl_crypto_type pairwise_crypto,
 			   u8 pairwise_crypto_len,
-			   enum crypto_type group_crypto,
+			   enum ath6kl_crypto_type group_crypto,
 			   u8 group_crypto_len, int ssid_len, u8 *ssid,
 			   u8 *bssid, u16 channel, u32 ctrl_flags,
 			   u8 nw_subtype)
@@ -2301,7 +2301,7 @@ int ath6kl_wmi_disctimeout_cmd(struct wmi *wmi, u8 if_idx, u8 timeout)
 }
 
 int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index,
-			  enum crypto_type key_type,
+			  enum ath6kl_crypto_type key_type,
 			  u8 key_usage, u8 key_len,
 			  u8 *key_rsc, unsigned int key_rsc_len,
 			  u8 *key_material,
diff --git a/drivers/net/wireless/ath/ath6kl/wmi.h b/drivers/net/wireless/ath/ath6kl/wmi.h
index a60bb49fe920..784940ba4c90 100644
--- a/drivers/net/wireless/ath/ath6kl/wmi.h
+++ b/drivers/net/wireless/ath/ath6kl/wmi.h
@@ -2556,9 +2556,9 @@ int ath6kl_wmi_connect_cmd(struct wmi *wmi, u8 if_idx,
 			   enum network_type nw_type,
 			   enum dot11_auth_mode dot11_auth_mode,
 			   enum auth_mode auth_mode,
-			   enum crypto_type pairwise_crypto,
+			   enum ath6kl_crypto_type pairwise_crypto,
 			   u8 pairwise_crypto_len,
-			   enum crypto_type group_crypto,
+			   enum ath6kl_crypto_type group_crypto,
 			   u8 group_crypto_len, int ssid_len, u8 *ssid,
 			   u8 *bssid, u16 channel, u32 ctrl_flags,
 			   u8 nw_subtype);
@@ -2610,7 +2610,7 @@ int ath6kl_wmi_config_debug_module_cmd(struct wmi *wmi, u32 valid, u32 config);
 
 int ath6kl_wmi_get_stats_cmd(struct wmi *wmi, u8 if_idx);
 int ath6kl_wmi_addkey_cmd(struct wmi *wmi, u8 if_idx, u8 key_index,
-			  enum crypto_type key_type,
+			  enum ath6kl_crypto_type key_type,
 			  u8 key_usage, u8 key_len,
 			  u8 *key_rsc, unsigned int key_rsc_len,
 			  u8 *key_material,

From 0fc07791bc775478d9450ca9c6c674b45f6c1998 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:06 -0800
Subject: [PATCH 282/351] datagram: open-code copy_page_to_iter

This will be useful to consolidate skb_copy_and_hash_datagram_iter and
skb_copy_and_csum_datagram to a single code path.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 net/core/datagram.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index 57f3a6fcfc1e..abe642181b64 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -445,11 +445,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 
 		end = start + skb_frag_size(frag);
 		if ((copy = end - offset) > 0) {
+			struct page *page = skb_frag_page(frag);
+			u8 *vaddr = kmap(page);
+
 			if (copy > len)
 				copy = len;
-			n = copy_page_to_iter(skb_frag_page(frag),
-					      frag->page_offset + offset -
-					      start, copy, to);
+			n = copy_to_iter(vaddr + frag->page_offset +
+					 offset - start, copy, to);
+			kunmap(page);
 			offset += n;
 			if (n != copy)
 				goto short_copy;

From cb002d074dabfaa2248507fd9478d16a542e4f1e Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:07 -0800
Subject: [PATCH 283/351] iov_iter: pass void csum pointer to
 csum_and_copy_to_iter

The single caller to csum_and_copy_to_iter is skb_copy_and_csum_datagram
and we are trying to unite its logic with skb_copy_datagram_iter by passing
a callback to the copy function that we want to apply. Thus, we need
to make the checksum pointer private to the function.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/uio.h | 2 +-
 lib/iov_iter.c      | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 55ce99ddb912..41d1f8d3313d 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -266,7 +266,7 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 {
 	i->count = count;
 }
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 54c248526b55..63a8999a234d 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1464,10 +1464,11 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum,
 }
 EXPORT_SYMBOL(csum_and_copy_from_iter_full);
 
-size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
+size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
 			     struct iov_iter *i)
 {
 	const char *from = addr;
+	__wsum *csum = csump;
 	__wsum sum, next;
 	size_t off = 0;
 

From 950fcaecd5cc6c014bb96506fd0652a501c85276 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:08 -0800
Subject: [PATCH 284/351] datagram: consolidate datagram copy to iter helpers

skb_copy_datagram_iter and skb_copy_and_csum_datagram are essentialy
the same but with a couple of differences: The first is the copy
operation used which either a simple copy or a csum_and_copy, and the
second are the behavior on the "short copy" path where simply copy
needs to return the number of bytes successfully copied while csum_and_copy
needs to fault immediately as the checksum is partial.

Introduce __skb_datagram_iter that additionally accepts:
1. copy operation function pointer
2. private data that goes with the copy operation
3. fault_short flag to indicate the action on short copy

Suggested-by: David S. Miller <davem@davemloft.net>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 net/core/datagram.c | 136 ++++++++++++++------------------------------
 1 file changed, 42 insertions(+), 94 deletions(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index abe642181b64..382543302ae5 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 }
 EXPORT_SYMBOL(skb_kill_datagram);
 
-/**
- *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: iovec iterator to copy to
- *	@len: amount of data to copy from buffer to iovec
- */
-int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
-			   struct iov_iter *to, int len)
+int __skb_datagram_iter(const struct sk_buff *skb, int offset,
+			struct iov_iter *to, int len, bool fault_short,
+			size_t (*cb)(const void *, size_t, void *, struct iov_iter *),
+			void *data)
 {
 	int start = skb_headlen(skb);
 	int i, copy = start - offset, start_off = offset, n;
 	struct sk_buff *frag_iter;
 
-	trace_skb_copy_datagram_iovec(skb, len);
-
 	/* Copy header. */
 	if (copy > 0) {
 		if (copy > len)
 			copy = len;
-		n = copy_to_iter(skb->data + offset, copy, to);
+		n = cb(skb->data + offset, copy, data, to);
 		offset += n;
 		if (n != copy)
 			goto short_copy;
@@ -450,8 +443,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 
 			if (copy > len)
 				copy = len;
-			n = copy_to_iter(vaddr + frag->page_offset +
-					 offset - start, copy, to);
+			n = cb(vaddr + frag->page_offset +
+				offset - start, copy, data, to);
 			kunmap(page);
 			offset += n;
 			if (n != copy)
@@ -471,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
 		if ((copy = end - offset) > 0) {
 			if (copy > len)
 				copy = len;
-			if (skb_copy_datagram_iter(frag_iter, offset - start,
-						   to, copy))
+			if (__skb_datagram_iter(frag_iter, offset - start,
+						to, copy, short_copy, cb, data))
 				goto fault;
 			if ((len -= copy) == 0)
 				return 0;
@@ -493,11 +486,32 @@ fault:
 	return -EFAULT;
 
 short_copy:
-	if (iov_iter_count(to))
+	if (fault_short || iov_iter_count(to))
 		goto fault;
 
 	return 0;
 }
+
+static size_t simple_copy_to_iter(const void *addr, size_t bytes,
+		void *data __always_unused, struct iov_iter *i)
+{
+	return copy_to_iter(addr, bytes, i);
+}
+
+/**
+ *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len)
+{
+	trace_skb_copy_datagram_iovec(skb, len);
+	return __skb_datagram_iter(skb, offset, to, len, false,
+			simple_copy_to_iter, NULL);
+}
 EXPORT_SYMBOL(skb_copy_datagram_iter);
 
 /**
@@ -648,87 +662,21 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
 }
 EXPORT_SYMBOL(zerocopy_sg_from_iter);
 
+/**
+ *	skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator
+ *          and update a checksum.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ *      @csump: checksum pointer
+ */
 static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
 				      struct iov_iter *to, int len,
 				      __wsum *csump)
 {
-	int start = skb_headlen(skb);
-	int i, copy = start - offset, start_off = offset;
-	struct sk_buff *frag_iter;
-	int pos = 0;
-	int n;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
-		offset += n;
-		if (n != copy)
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		pos = copy;
-	}
-
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			__wsum csum2 = 0;
-			struct page *page = skb_frag_page(frag);
-			u8  *vaddr = kmap(page);
-
-			if (copy > len)
-				copy = len;
-			n = csum_and_copy_to_iter(vaddr + frag->page_offset +
-						  offset - start, copy,
-						  &csum2, to);
-			kunmap(page);
-			offset += n;
-			if (n != copy)
-				goto fault;
-			*csump = csum_block_add(*csump, csum2, pos);
-			if (!(len -= copy))
-				return 0;
-			pos += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			__wsum csum2 = 0;
-			if (copy > len)
-				copy = len;
-			if (skb_copy_and_csum_datagram(frag_iter,
-						       offset - start,
-						       to, copy,
-						       &csum2))
-				goto fault;
-			*csump = csum_block_add(*csump, csum2, pos);
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			pos += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	iov_iter_revert(to, offset - start_off);
-	return -EFAULT;
+	return __skb_datagram_iter(skb, offset, to, len, true,
+			csum_and_copy_to_iter, csump);
 }
 
 __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)

From d05f443554b3c7dc6d46e3ba9c3c4de468875d4f Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:09 -0800
Subject: [PATCH 285/351] iov_iter: introduce hash_and_copy_to_iter helper

Allow consumers that want to use iov iterator helpers and also update
a predefined hash calculation online when copying data. This is useful
when copying incoming network buffers to a local iterator and calculate
a digest on the incoming stream. nvme-tcp host driver that will be
introduced in following patches is the first consumer via
skb_copy_and_hash_datagram_iter.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/uio.h |  3 +++
 lib/iov_iter.c      | 16 ++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 41d1f8d3313d..ecf584f6b82d 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -11,6 +11,7 @@
 
 #include <linux/kernel.h>
 #include <linux/thread_info.h>
+#include <crypto/hash.h>
 #include <uapi/linux/uio.h>
 
 struct page;
@@ -269,6 +270,8 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
 size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump, struct iov_iter *i);
 size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
 bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i);
+size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+		struct iov_iter *i);
 
 int import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 63a8999a234d..1928009f506e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -6,6 +6,7 @@
 #include <linux/vmalloc.h>
 #include <linux/splice.h>
 #include <net/checksum.h>
+#include <linux/scatterlist.h>
 
 #define PIPE_PARANOIA /* for now */
 
@@ -1511,6 +1512,21 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csump,
 }
 EXPORT_SYMBOL(csum_and_copy_to_iter);
 
+size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
+		struct iov_iter *i)
+{
+	struct ahash_request *hash = hashp;
+	struct scatterlist sg;
+	size_t copied;
+
+	copied = copy_to_iter(addr, bytes, i);
+	sg_init_one(&sg, addr, copied);
+	ahash_request_set_crypt(hash, &sg, NULL, copied);
+	crypto_ahash_update(hash);
+	return copied;
+}
+EXPORT_SYMBOL(hash_and_copy_to_iter);
+
 int iov_iter_npages(const struct iov_iter *i, int maxpages)
 {
 	size_t size = i->count;

From 65d69e2505bb64f6a8d7f417f6e46e2a351174c6 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:10 -0800
Subject: [PATCH 286/351] datagram: introduce skb_copy_and_hash_datagram_iter
 helper

Introduce a helper to copy datagram into an iovec iterator
but also update a predefined hash. This is useful for
consumers of skb_copy_datagram_iter to also support inflight
data digest without having to finish to copy and only then
traverse the iovec and calculate the digest hash.

Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/skbuff.h |  3 +++
 net/core/datagram.c    | 20 +++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0d1b2c3f127b..b96c809c29eb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3325,6 +3325,9 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
 }
 int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
 				   struct msghdr *msg);
+int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len,
+			   struct ahash_request *hash);
 int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
 				 struct iov_iter *from, int len);
 int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 382543302ae5..ef262282c8be 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -465,7 +465,7 @@ int __skb_datagram_iter(const struct sk_buff *skb, int offset,
 			if (copy > len)
 				copy = len;
 			if (__skb_datagram_iter(frag_iter, offset - start,
-						to, copy, short_copy, cb, data))
+						to, copy, fault_short, cb, data))
 				goto fault;
 			if ((len -= copy) == 0)
 				return 0;
@@ -492,6 +492,24 @@ short_copy:
 	return 0;
 }
 
+/**
+ *	skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
+ *          and update a hash.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ *      @hash: hash request to update
+ */
+int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len,
+			   struct ahash_request *hash)
+{
+	return __skb_datagram_iter(skb, offset, to, len, true,
+			hash_and_copy_to_iter, hash);
+}
+EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
+
 static size_t simple_copy_to_iter(const void *addr, size_t bytes,
 		void *data __always_unused, struct iov_iter *i)
 {

From 1672ddb8d691e4433806373ec4104f37a86efab0 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:11 -0800
Subject: [PATCH 287/351] nvmet: Add install_queue callout

nvmet-tcp will implement it to allocate queue commands which
are only known at nvmf connect time (sq size).

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/fabrics-cmd.c | 10 ++++++++++
 drivers/nvme/target/nvmet.h       |  1 +
 2 files changed, 11 insertions(+)

diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 328ae46d8344..ee7d84621d65 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -121,6 +121,16 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
 		req->rsp->sq_head = cpu_to_le16(0xffff);
 	}
 
+	if (ctrl->ops->install_queue) {
+		u16 ret = ctrl->ops->install_queue(req->sq);
+
+		if (ret) {
+			pr_err("failed to install queue %d cntlid %d ret %x\n",
+				qid, ret, ctrl->cntlid);
+			return ret;
+		}
+	}
+
 	return 0;
 }
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 7d8b7a7d572a..89df51ee5bdf 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -279,6 +279,7 @@ struct nvmet_fabrics_ops {
 	void (*delete_ctrl)(struct nvmet_ctrl *ctrl);
 	void (*disc_traddr)(struct nvmet_req *req,
 			struct nvmet_port *port, char *traddr);
+	u16 (*install_queue)(struct nvmet_sq *nvme_sq);
 };
 
 #define NVMET_MAX_INLINE_BIOVEC	8

From 3b49fa807284ec30669a95fd5f3806e127d29f4d Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:12 -0800
Subject: [PATCH 288/351] nvme-fabrics: allow user passing header digest

Header digest is a nvme-tcp specific feature, but nothing prevents other
transports reusing the concept so do not associate with tcp transport
solely.

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c | 5 +++++
 drivers/nvme/host/fabrics.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 10074ac7731b..4272f8a95db3 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -614,6 +614,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_HOST_ID,		"hostid=%s"		},
 	{ NVMF_OPT_DUP_CONNECT,		"duplicate_connect"	},
 	{ NVMF_OPT_DISABLE_SQFLOW,	"disable_sqflow"	},
+	{ NVMF_OPT_HDR_DIGEST,		"hdr_digest"		},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -633,6 +634,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY;
 	opts->kato = NVME_DEFAULT_KATO;
 	opts->duplicate_connect = false;
+	opts->hdr_digest = false;
 
 	options = o = kstrdup(buf, GFP_KERNEL);
 	if (!options)
@@ -827,6 +829,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DISABLE_SQFLOW:
 			opts->disable_sqflow = true;
 			break;
+		case NVMF_OPT_HDR_DIGEST:
+			opts->hdr_digest = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index ecd9a006a091..a6127f1a9e8e 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -59,6 +59,7 @@ enum {
 	NVMF_OPT_HOST_ID	= 1 << 12,
 	NVMF_OPT_DUP_CONNECT	= 1 << 13,
 	NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
+	NVMF_OPT_HDR_DIGEST	= 1 << 15,
 };
 
 /**
@@ -103,6 +104,7 @@ struct nvmf_ctrl_options {
 	struct nvmf_host	*host;
 	int			max_reconnects;
 	bool			disable_sqflow;
+	bool			hdr_digest;
 };
 
 /*

From 20d44e86321299f61bb782a39aaa30f579823f58 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:13 -0800
Subject: [PATCH 289/351] nvme-fabrics: allow user passing data digest

Data digest is a nvme-tcp specific feature, but nothing prevents other
transports reusing the concept so do not associate with tcp transport
solely.

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c | 5 +++++
 drivers/nvme/host/fabrics.h | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 4272f8a95db3..9c62c6838b76 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -615,6 +615,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_DUP_CONNECT,		"duplicate_connect"	},
 	{ NVMF_OPT_DISABLE_SQFLOW,	"disable_sqflow"	},
 	{ NVMF_OPT_HDR_DIGEST,		"hdr_digest"		},
+	{ NVMF_OPT_DATA_DIGEST,		"data_digest"		},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -635,6 +636,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 	opts->kato = NVME_DEFAULT_KATO;
 	opts->duplicate_connect = false;
 	opts->hdr_digest = false;
+	opts->data_digest = false;
 
 	options = o = kstrdup(buf, GFP_KERNEL);
 	if (!options)
@@ -832,6 +834,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_HDR_DIGEST:
 			opts->hdr_digest = true;
 			break;
+		case NVMF_OPT_DATA_DIGEST:
+			opts->data_digest = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a6127f1a9e8e..524a02a67817 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -60,6 +60,7 @@ enum {
 	NVMF_OPT_DUP_CONNECT	= 1 << 13,
 	NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
 	NVMF_OPT_HDR_DIGEST	= 1 << 15,
+	NVMF_OPT_DATA_DIGEST	= 1 << 16,
 };
 
 /**
@@ -105,6 +106,7 @@ struct nvmf_ctrl_options {
 	int			max_reconnects;
 	bool			disable_sqflow;
 	bool			hdr_digest;
+	bool			data_digest;
 };
 
 /*

From fc221d05447aa6db686a6724dd08aa6cce0924d1 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:14 -0800
Subject: [PATCH 290/351] nvme-tcp: Add protocol header

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme-tcp.h | 189 +++++++++++++++++++++++++++++++++++++++
 include/linux/nvme.h     |   1 +
 2 files changed, 190 insertions(+)
 create mode 100644 include/linux/nvme-tcp.h

diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h
new file mode 100644
index 000000000000..03d87c0550a9
--- /dev/null
+++ b/include/linux/nvme-tcp.h
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP protocol header.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+
+#ifndef _LINUX_NVME_TCP_H
+#define _LINUX_NVME_TCP_H
+
+#include <linux/nvme.h>
+
+#define NVME_TCP_DISC_PORT	8009
+#define NVME_TCP_ADMIN_CCSZ	SZ_8K
+#define NVME_TCP_DIGEST_LENGTH	4
+
+enum nvme_tcp_pfv {
+	NVME_TCP_PFV_1_0 = 0x0,
+};
+
+enum nvme_tcp_fatal_error_status {
+	NVME_TCP_FES_INVALID_PDU_HDR		= 0x01,
+	NVME_TCP_FES_PDU_SEQ_ERR		= 0x02,
+	NVME_TCP_FES_HDR_DIGEST_ERR		= 0x03,
+	NVME_TCP_FES_DATA_OUT_OF_RANGE		= 0x04,
+	NVME_TCP_FES_R2T_LIMIT_EXCEEDED		= 0x05,
+	NVME_TCP_FES_DATA_LIMIT_EXCEEDED	= 0x05,
+	NVME_TCP_FES_UNSUPPORTED_PARAM		= 0x06,
+};
+
+enum nvme_tcp_digest_option {
+	NVME_TCP_HDR_DIGEST_ENABLE	= (1 << 0),
+	NVME_TCP_DATA_DIGEST_ENABLE	= (1 << 1),
+};
+
+enum nvme_tcp_pdu_type {
+	nvme_tcp_icreq		= 0x0,
+	nvme_tcp_icresp		= 0x1,
+	nvme_tcp_h2c_term	= 0x2,
+	nvme_tcp_c2h_term	= 0x3,
+	nvme_tcp_cmd		= 0x4,
+	nvme_tcp_rsp		= 0x5,
+	nvme_tcp_h2c_data	= 0x6,
+	nvme_tcp_c2h_data	= 0x7,
+	nvme_tcp_r2t		= 0x9,
+};
+
+enum nvme_tcp_pdu_flags {
+	NVME_TCP_F_HDGST		= (1 << 0),
+	NVME_TCP_F_DDGST		= (1 << 1),
+	NVME_TCP_F_DATA_LAST		= (1 << 2),
+	NVME_TCP_F_DATA_SUCCESS		= (1 << 3),
+};
+
+/**
+ * struct nvme_tcp_hdr - nvme tcp pdu common header
+ *
+ * @type:          pdu type
+ * @flags:         pdu specific flags
+ * @hlen:          pdu header length
+ * @pdo:           pdu data offset
+ * @plen:          pdu wire byte length
+ */
+struct nvme_tcp_hdr {
+	__u8	type;
+	__u8	flags;
+	__u8	hlen;
+	__u8	pdo;
+	__le32	plen;
+};
+
+/**
+ * struct nvme_tcp_icreq_pdu - nvme tcp initialize connection request pdu
+ *
+ * @hdr:           pdu generic header
+ * @pfv:           pdu version format
+ * @hpda:          host pdu data alignment (dwords, 0's based)
+ * @digest:        digest types enabled
+ * @maxr2t:        maximum r2ts per request supported
+ */
+struct nvme_tcp_icreq_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__le16			pfv;
+	__u8			hpda;
+	__u8			digest;
+	__le32			maxr2t;
+	__u8			rsvd2[112];
+};
+
+/**
+ * struct nvme_tcp_icresp_pdu - nvme tcp initialize connection response pdu
+ *
+ * @hdr:           pdu common header
+ * @pfv:           pdu version format
+ * @cpda:          controller pdu data alignment (dowrds, 0's based)
+ * @digest:        digest types enabled
+ * @maxdata:       maximum data capsules per r2t supported
+ */
+struct nvme_tcp_icresp_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__le16			pfv;
+	__u8			cpda;
+	__u8			digest;
+	__le32			maxdata;
+	__u8			rsvd[112];
+};
+
+/**
+ * struct nvme_tcp_term_pdu - nvme tcp terminate connection pdu
+ *
+ * @hdr:           pdu common header
+ * @fes:           fatal error status
+ * @fei:           fatal error information
+ */
+struct nvme_tcp_term_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__le16			fes;
+	__le32			fei;
+	__u8			rsvd[8];
+};
+
+/**
+ * struct nvme_tcp_cmd_pdu - nvme tcp command capsule pdu
+ *
+ * @hdr:           pdu common header
+ * @cmd:           nvme command
+ */
+struct nvme_tcp_cmd_pdu {
+	struct nvme_tcp_hdr	hdr;
+	struct nvme_command	cmd;
+};
+
+/**
+ * struct nvme_tcp_rsp_pdu - nvme tcp response capsule pdu
+ *
+ * @hdr:           pdu common header
+ * @hdr:           nvme-tcp generic header
+ * @cqe:           nvme completion queue entry
+ */
+struct nvme_tcp_rsp_pdu {
+	struct nvme_tcp_hdr	hdr;
+	struct nvme_completion	cqe;
+};
+
+/**
+ * struct nvme_tcp_r2t_pdu - nvme tcp ready-to-transfer pdu
+ *
+ * @hdr:           pdu common header
+ * @command_id:    nvme command identifier which this relates to
+ * @ttag:          transfer tag (controller generated)
+ * @r2t_offset:    offset from the start of the command data
+ * @r2t_length:    length the host is allowed to send
+ */
+struct nvme_tcp_r2t_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__u16			command_id;
+	__u16			ttag;
+	__le32			r2t_offset;
+	__le32			r2t_length;
+	__u8			rsvd[4];
+};
+
+/**
+ * struct nvme_tcp_data_pdu - nvme tcp data pdu
+ *
+ * @hdr:           pdu common header
+ * @command_id:    nvme command identifier which this relates to
+ * @ttag:          transfer tag (controller generated)
+ * @data_offset:   offset from the start of the command data
+ * @data_length:   length of the data stream
+ */
+struct nvme_tcp_data_pdu {
+	struct nvme_tcp_hdr	hdr;
+	__u16			command_id;
+	__u16			ttag;
+	__le32			data_offset;
+	__le32			data_length;
+	__u8			rsvd[4];
+};
+
+union nvme_tcp_pdu {
+	struct nvme_tcp_icreq_pdu	icreq;
+	struct nvme_tcp_icresp_pdu	icresp;
+	struct nvme_tcp_cmd_pdu		cmd;
+	struct nvme_tcp_rsp_pdu		rsp;
+	struct nvme_tcp_r2t_pdu		r2t;
+	struct nvme_tcp_data_pdu	data;
+};
+
+#endif /* _LINUX_NVME_TCP_H */
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 88812cb15be0..4d7907e3771e 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -52,6 +52,7 @@ enum {
 enum {
 	NVMF_TRTYPE_RDMA	= 1,	/* RDMA */
 	NVMF_TRTYPE_FC		= 2,	/* Fibre Channel */
+	NVMF_TRTYPE_TCP		= 3,	/* TCP/IP */
 	NVMF_TRTYPE_LOOP	= 254,	/* Reserved for host usage */
 	NVMF_TRTYPE_MAX,
 };

From 872d26a391da92ed8f0c0f5cb5fef428067b7f30 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:15 -0800
Subject: [PATCH 291/351] nvmet-tcp: add NVMe over TCP target driver

This patch implements the TCP transport driver for the NVMe over Fabrics
target stack. This allows exporting NVMe over Fabrics functionality over
good old TCP/IP.

The driver implements the TP 8000 of how nvme over fabrics capsules and
data are encapsulated in nvme-tcp pdus and exchaged on top of a TCP byte
stream. nvme-tcp header and data digest are supported as well.

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Roy Shterman <roys@lightbitslabs.com>
Signed-off-by: Solganik Alexander <sashas@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/Kconfig  |   10 +
 drivers/nvme/target/Makefile |    2 +
 drivers/nvme/target/tcp.c    | 1737 ++++++++++++++++++++++++++++++++++
 3 files changed, 1749 insertions(+)
 create mode 100644 drivers/nvme/target/tcp.c

diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 3c7b61ddb0d1..d94f25cde019 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -60,3 +60,13 @@ config NVME_TARGET_FCLOOP
 	  to test NVMe-FC transport interfaces.
 
 	  If unsure, say N.
+
+config NVME_TARGET_TCP
+	tristate "NVMe over Fabrics TCP target support"
+	depends on INET
+	depends on NVME_TARGET
+	help
+	  This enables the NVMe TCP target support, which allows exporting NVMe
+	  devices over TCP.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index 8118c93391c6..8c3ad0fb6860 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_TARGET_LOOP)		+= nvme-loop.o
 obj-$(CONFIG_NVME_TARGET_RDMA)		+= nvmet-rdma.o
 obj-$(CONFIG_NVME_TARGET_FC)		+= nvmet-fc.o
 obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
+obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
 			discovery.o io-cmd-file.o io-cmd-bdev.o
@@ -12,3 +13,4 @@ nvme-loop-y	+= loop.o
 nvmet-rdma-y	+= rdma.o
 nvmet-fc-y	+= fc.o
 nvme-fcloop-y	+= fcloop.o
+nvmet-tcp-y	+= tcp.o
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
new file mode 100644
index 000000000000..d31bec260160
--- /dev/null
+++ b/drivers/nvme/target/tcp.c
@@ -0,0 +1,1737 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP target.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/inet.h>
+#include <linux/llist.h>
+#include <crypto/hash.h>
+
+#include "nvmet.h"
+
+#define NVMET_TCP_DEF_INLINE_DATA_SIZE	(4 * PAGE_SIZE)
+
+#define NVMET_TCP_RECV_BUDGET		8
+#define NVMET_TCP_SEND_BUDGET		8
+#define NVMET_TCP_IO_WORK_BUDGET	64
+
+enum nvmet_tcp_send_state {
+	NVMET_TCP_SEND_DATA_PDU,
+	NVMET_TCP_SEND_DATA,
+	NVMET_TCP_SEND_R2T,
+	NVMET_TCP_SEND_DDGST,
+	NVMET_TCP_SEND_RESPONSE
+};
+
+enum nvmet_tcp_recv_state {
+	NVMET_TCP_RECV_PDU,
+	NVMET_TCP_RECV_DATA,
+	NVMET_TCP_RECV_DDGST,
+	NVMET_TCP_RECV_ERR,
+};
+
+enum {
+	NVMET_TCP_F_INIT_FAILED = (1 << 0),
+};
+
+struct nvmet_tcp_cmd {
+	struct nvmet_tcp_queue		*queue;
+	struct nvmet_req		req;
+
+	struct nvme_tcp_cmd_pdu		*cmd_pdu;
+	struct nvme_tcp_rsp_pdu		*rsp_pdu;
+	struct nvme_tcp_data_pdu	*data_pdu;
+	struct nvme_tcp_r2t_pdu		*r2t_pdu;
+
+	u32				rbytes_done;
+	u32				wbytes_done;
+
+	u32				pdu_len;
+	u32				pdu_recv;
+	int				sg_idx;
+	int				nr_mapped;
+	struct msghdr			recv_msg;
+	struct kvec			*iov;
+	u32				flags;
+
+	struct list_head		entry;
+	struct llist_node		lentry;
+
+	/* send state */
+	u32				offset;
+	struct scatterlist		*cur_sg;
+	enum nvmet_tcp_send_state	state;
+
+	__le32				exp_ddgst;
+	__le32				recv_ddgst;
+};
+
+enum nvmet_tcp_queue_state {
+	NVMET_TCP_Q_CONNECTING,
+	NVMET_TCP_Q_LIVE,
+	NVMET_TCP_Q_DISCONNECTING,
+};
+
+struct nvmet_tcp_queue {
+	struct socket		*sock;
+	struct nvmet_tcp_port	*port;
+	struct work_struct	io_work;
+	int			cpu;
+	struct nvmet_cq		nvme_cq;
+	struct nvmet_sq		nvme_sq;
+
+	/* send state */
+	struct nvmet_tcp_cmd	*cmds;
+	unsigned int		nr_cmds;
+	struct list_head	free_list;
+	struct llist_head	resp_list;
+	struct list_head	resp_send_list;
+	int			send_list_len;
+	struct nvmet_tcp_cmd	*snd_cmd;
+
+	/* recv state */
+	int			offset;
+	int			left;
+	enum nvmet_tcp_recv_state rcv_state;
+	struct nvmet_tcp_cmd	*cmd;
+	union nvme_tcp_pdu	pdu;
+
+	/* digest state */
+	bool			hdr_digest;
+	bool			data_digest;
+	struct ahash_request	*snd_hash;
+	struct ahash_request	*rcv_hash;
+
+	spinlock_t		state_lock;
+	enum nvmet_tcp_queue_state state;
+
+	struct sockaddr_storage	sockaddr;
+	struct sockaddr_storage	sockaddr_peer;
+	struct work_struct	release_work;
+
+	int			idx;
+	struct list_head	queue_list;
+
+	struct nvmet_tcp_cmd	connect;
+
+	struct page_frag_cache	pf_cache;
+
+	void (*data_ready)(struct sock *);
+	void (*state_change)(struct sock *);
+	void (*write_space)(struct sock *);
+};
+
+struct nvmet_tcp_port {
+	struct socket		*sock;
+	struct work_struct	accept_work;
+	struct nvmet_port	*nport;
+	struct sockaddr_storage addr;
+	int			last_cpu;
+	void (*data_ready)(struct sock *);
+};
+
+static DEFINE_IDA(nvmet_tcp_queue_ida);
+static LIST_HEAD(nvmet_tcp_queue_list);
+static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
+
+static struct workqueue_struct *nvmet_tcp_wq;
+static struct nvmet_fabrics_ops nvmet_tcp_ops;
+static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
+static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
+
+static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
+		struct nvmet_tcp_cmd *cmd)
+{
+	return cmd - queue->cmds;
+}
+
+static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
+{
+	return nvme_is_write(cmd->req.cmd) &&
+		cmd->rbytes_done < cmd->req.transfer_len;
+}
+
+static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
+{
+	return nvmet_tcp_has_data_in(cmd) && !cmd->req.rsp->status;
+}
+
+static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
+{
+	return !nvme_is_write(cmd->req.cmd) &&
+		cmd->req.transfer_len > 0 &&
+		!cmd->req.rsp->status;
+}
+
+static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
+{
+	return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
+		!cmd->rbytes_done;
+}
+
+static inline struct nvmet_tcp_cmd *
+nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmd;
+
+	cmd = list_first_entry_or_null(&queue->free_list,
+				struct nvmet_tcp_cmd, entry);
+	if (!cmd)
+		return NULL;
+	list_del_init(&cmd->entry);
+
+	cmd->rbytes_done = cmd->wbytes_done = 0;
+	cmd->pdu_len = 0;
+	cmd->pdu_recv = 0;
+	cmd->iov = NULL;
+	cmd->flags = 0;
+	return cmd;
+}
+
+static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
+{
+	if (unlikely(cmd == &cmd->queue->connect))
+		return;
+
+	list_add_tail(&cmd->entry, &cmd->queue->free_list);
+}
+
+static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
+{
+	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
+{
+	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
+		void *pdu, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, pdu, len);
+	ahash_request_set_crypt(hash, &sg, pdu + len, len);
+	crypto_ahash_digest(hash);
+}
+
+static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
+	void *pdu, size_t len)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	__le32 recv_digest;
+	__le32 exp_digest;
+
+	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+		pr_err("queue %d: header digest enabled but no header digest\n",
+			queue->idx);
+		return -EPROTO;
+	}
+
+	recv_digest = *(__le32 *)(pdu + hdr->hlen);
+	nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
+	exp_digest = *(__le32 *)(pdu + hdr->hlen);
+	if (recv_digest != exp_digest) {
+		pr_err("queue %d: header digest error: recv %#x expected %#x\n",
+			queue->idx, le32_to_cpu(recv_digest),
+			le32_to_cpu(exp_digest));
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	u8 digest_len = nvmet_tcp_hdgst_len(queue);
+	u32 len;
+
+	len = le32_to_cpu(hdr->plen) - hdr->hlen -
+		(hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
+
+	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+		pr_err("queue %d: data digest flag is cleared\n", queue->idx);
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
+{
+	struct scatterlist *sg;
+	int i;
+
+	sg = &cmd->req.sg[cmd->sg_idx];
+
+	for (i = 0; i < cmd->nr_mapped; i++)
+		kunmap(sg_page(&sg[i]));
+}
+
+static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
+{
+	struct kvec *iov = cmd->iov;
+	struct scatterlist *sg;
+	u32 length, offset, sg_offset;
+
+	length = cmd->pdu_len;
+	cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
+	offset = cmd->rbytes_done;
+	cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
+	sg_offset = offset % PAGE_SIZE;
+	sg = &cmd->req.sg[cmd->sg_idx];
+
+	while (length) {
+		u32 iov_len = min_t(u32, length, sg->length - sg_offset);
+
+		iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
+		iov->iov_len = iov_len;
+
+		length -= iov_len;
+		sg = sg_next(sg);
+		iov++;
+	}
+
+	iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
+		cmd->nr_mapped, cmd->pdu_len);
+}
+
+static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
+{
+	queue->rcv_state = NVMET_TCP_RECV_ERR;
+	if (queue->nvme_sq.ctrl)
+		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
+	else
+		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+}
+
+static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
+	u32 len = le32_to_cpu(sgl->length);
+
+	if (!cmd->req.data_len)
+		return 0;
+
+	if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
+			  NVME_SGL_FMT_OFFSET)) {
+		if (!nvme_is_write(cmd->req.cmd))
+			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+
+		if (len > cmd->req.port->inline_data_size)
+			return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
+		cmd->pdu_len = len;
+	}
+	cmd->req.transfer_len += len;
+
+	cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
+	if (!cmd->req.sg)
+		return NVME_SC_INTERNAL;
+	cmd->cur_sg = cmd->req.sg;
+
+	if (nvmet_tcp_has_data_in(cmd)) {
+		cmd->iov = kmalloc_array(cmd->req.sg_cnt,
+				sizeof(*cmd->iov), GFP_KERNEL);
+		if (!cmd->iov)
+			goto err;
+	}
+
+	return 0;
+err:
+	sgl_free(cmd->req.sg);
+	return NVME_SC_INTERNAL;
+}
+
+static void nvmet_tcp_ddgst(struct ahash_request *hash,
+		struct nvmet_tcp_cmd *cmd)
+{
+	ahash_request_set_crypt(hash, cmd->req.sg,
+		(void *)&cmd->exp_ddgst, cmd->req.transfer_len);
+	crypto_ahash_digest(hash);
+}
+
+static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
+
+	cmd->offset = 0;
+	cmd->state = NVMET_TCP_SEND_DATA_PDU;
+
+	pdu->hdr.type = nvme_tcp_c2h_data;
+	pdu->hdr.flags = NVME_TCP_F_DATA_LAST;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
+	pdu->hdr.plen =
+		cpu_to_le32(pdu->hdr.hlen + hdgst +
+				cmd->req.transfer_len + ddgst);
+	pdu->command_id = cmd->req.rsp->command_id;
+	pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
+	pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
+
+	if (queue->data_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_DDGST;
+		nvmet_tcp_ddgst(queue->snd_hash, cmd);
+	}
+
+	if (cmd->queue->hdr_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+	}
+}
+
+static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+
+	cmd->offset = 0;
+	cmd->state = NVMET_TCP_SEND_R2T;
+
+	pdu->hdr.type = nvme_tcp_r2t;
+	pdu->hdr.flags = 0;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = 0;
+	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+	pdu->command_id = cmd->req.cmd->common.command_id;
+	pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
+	pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
+	pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
+	if (cmd->queue->hdr_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+	}
+}
+
+static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+
+	cmd->offset = 0;
+	cmd->state = NVMET_TCP_SEND_RESPONSE;
+
+	pdu->hdr.type = nvme_tcp_rsp;
+	pdu->hdr.flags = 0;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = 0;
+	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+	if (cmd->queue->hdr_digest) {
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+		nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+	}
+}
+
+static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
+{
+	struct llist_node *node;
+
+	node = llist_del_all(&queue->resp_list);
+	if (!node)
+		return;
+
+	while (node) {
+		struct nvmet_tcp_cmd *cmd = llist_entry(node,
+					struct nvmet_tcp_cmd, lentry);
+
+		list_add(&cmd->entry, &queue->resp_send_list);
+		node = node->next;
+		queue->send_list_len++;
+	}
+}
+
+static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
+{
+	queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
+				struct nvmet_tcp_cmd, entry);
+	if (!queue->snd_cmd) {
+		nvmet_tcp_process_resp_list(queue);
+		queue->snd_cmd =
+			list_first_entry_or_null(&queue->resp_send_list,
+					struct nvmet_tcp_cmd, entry);
+		if (unlikely(!queue->snd_cmd))
+			return NULL;
+	}
+
+	list_del_init(&queue->snd_cmd->entry);
+	queue->send_list_len--;
+
+	if (nvmet_tcp_need_data_out(queue->snd_cmd))
+		nvmet_setup_c2h_data_pdu(queue->snd_cmd);
+	else if (nvmet_tcp_need_data_in(queue->snd_cmd))
+		nvmet_setup_r2t_pdu(queue->snd_cmd);
+	else
+		nvmet_setup_response_pdu(queue->snd_cmd);
+
+	return queue->snd_cmd;
+}
+
+static void nvmet_tcp_queue_response(struct nvmet_req *req)
+{
+	struct nvmet_tcp_cmd *cmd =
+		container_of(req, struct nvmet_tcp_cmd, req);
+	struct nvmet_tcp_queue	*queue = cmd->queue;
+
+	llist_add(&cmd->lentry, &queue->resp_list);
+	queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
+}
+
+static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
+	int ret;
+
+	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
+			offset_in_page(cmd->data_pdu) + cmd->offset,
+			left, MSG_DONTWAIT | MSG_MORE);
+	if (ret <= 0)
+		return ret;
+
+	cmd->offset += ret;
+	left -= ret;
+
+	if (left)
+		return -EAGAIN;
+
+	cmd->state = NVMET_TCP_SEND_DATA;
+	cmd->offset  = 0;
+	return 1;
+}
+
+static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	int ret;
+
+	while (cmd->cur_sg) {
+		struct page *page = sg_page(cmd->cur_sg);
+		u32 left = cmd->cur_sg->length - cmd->offset;
+
+		ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
+					left, MSG_DONTWAIT | MSG_MORE);
+		if (ret <= 0)
+			return ret;
+
+		cmd->offset += ret;
+		cmd->wbytes_done += ret;
+
+		/* Done with sg?*/
+		if (cmd->offset == cmd->cur_sg->length) {
+			cmd->cur_sg = sg_next(cmd->cur_sg);
+			cmd->offset = 0;
+		}
+	}
+
+	if (queue->data_digest) {
+		cmd->state = NVMET_TCP_SEND_DDGST;
+		cmd->offset = 0;
+	} else {
+		nvmet_setup_response_pdu(cmd);
+	}
+	return 1;
+
+}
+
+static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
+		bool last_in_batch)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
+	int flags = MSG_DONTWAIT;
+	int ret;
+
+	if (!last_in_batch && cmd->queue->send_list_len)
+		flags |= MSG_MORE;
+	else
+		flags |= MSG_EOR;
+
+	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
+		offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
+	if (ret <= 0)
+		return ret;
+	cmd->offset += ret;
+	left -= ret;
+
+	if (left)
+		return -EAGAIN;
+
+	kfree(cmd->iov);
+	sgl_free(cmd->req.sg);
+	cmd->queue->snd_cmd = NULL;
+	nvmet_tcp_put_cmd(cmd);
+	return 1;
+}
+
+static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
+	int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
+	int flags = MSG_DONTWAIT;
+	int ret;
+
+	if (!last_in_batch && cmd->queue->send_list_len)
+		flags |= MSG_MORE;
+	else
+		flags |= MSG_EOR;
+
+	ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
+		offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
+	if (ret <= 0)
+		return ret;
+	cmd->offset += ret;
+	left -= ret;
+
+	if (left)
+		return -EAGAIN;
+
+	cmd->queue->snd_cmd = NULL;
+	return 1;
+}
+
+static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvmet_tcp_queue *queue = cmd->queue;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+	struct kvec iov = {
+		.iov_base = &cmd->exp_ddgst + cmd->offset,
+		.iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
+	};
+	int ret;
+
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	cmd->offset += ret;
+	nvmet_setup_response_pdu(cmd);
+	return 1;
+}
+
+static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
+		bool last_in_batch)
+{
+	struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
+	int ret = 0;
+
+	if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
+		cmd = nvmet_tcp_fetch_cmd(queue);
+		if (unlikely(!cmd))
+			return 0;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
+		ret = nvmet_try_send_data_pdu(cmd);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_DATA) {
+		ret = nvmet_try_send_data(cmd);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_DDGST) {
+		ret = nvmet_try_send_ddgst(cmd);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_R2T) {
+		ret = nvmet_try_send_r2t(cmd, last_in_batch);
+		if (ret <= 0)
+			goto done_send;
+	}
+
+	if (cmd->state == NVMET_TCP_SEND_RESPONSE)
+		ret = nvmet_try_send_response(cmd, last_in_batch);
+
+done_send:
+	if (ret < 0) {
+		if (ret == -EAGAIN)
+			return 0;
+		return ret;
+	}
+
+	return 1;
+}
+
+static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
+		int budget, int *sends)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < budget; i++) {
+		ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
+		if (ret <= 0)
+			break;
+		(*sends)++;
+	}
+
+	return ret;
+}
+
+static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
+{
+	queue->offset = 0;
+	queue->left = sizeof(struct nvme_tcp_hdr);
+	queue->cmd = NULL;
+	queue->rcv_state = NVMET_TCP_RECV_PDU;
+}
+
+static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+	ahash_request_free(queue->rcv_hash);
+	ahash_request_free(queue->snd_hash);
+	crypto_free_ahash(tfm);
+}
+
+static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm;
+
+	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->snd_hash)
+		goto free_tfm;
+	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->rcv_hash)
+		goto free_snd_hash;
+	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+	return 0;
+free_snd_hash:
+	ahash_request_free(queue->snd_hash);
+free_tfm:
+	crypto_free_ahash(tfm);
+	return -ENOMEM;
+}
+
+
+static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
+	struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
+	struct msghdr msg = {};
+	struct kvec iov;
+	int ret;
+
+	if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
+		pr_err("bad nvme-tcp pdu length (%d)\n",
+			le32_to_cpu(icreq->hdr.plen));
+		nvmet_tcp_fatal_error(queue);
+	}
+
+	if (icreq->pfv != NVME_TCP_PFV_1_0) {
+		pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
+		return -EPROTO;
+	}
+
+	if (icreq->hpda != 0) {
+		pr_err("queue %d: unsupported hpda %d\n", queue->idx,
+			icreq->hpda);
+		return -EPROTO;
+	}
+
+	if (icreq->maxr2t != 0) {
+		pr_err("queue %d: unsupported maxr2t %d\n", queue->idx,
+			le16_to_cpu(icreq->maxr2t) + 1);
+		return -EPROTO;
+	}
+
+	queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+	queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+	if (queue->hdr_digest || queue->data_digest) {
+		ret = nvmet_tcp_alloc_crypto(queue);
+		if (ret)
+			return ret;
+	}
+
+	memset(icresp, 0, sizeof(*icresp));
+	icresp->hdr.type = nvme_tcp_icresp;
+	icresp->hdr.hlen = sizeof(*icresp);
+	icresp->hdr.pdo = 0;
+	icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
+	icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+	icresp->maxdata = 0xffff; /* FIXME: support r2t */
+	icresp->cpda = 0;
+	if (queue->hdr_digest)
+		icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+	if (queue->data_digest)
+		icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+	iov.iov_base = icresp;
+	iov.iov_len = sizeof(*icresp);
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (ret < 0)
+		goto free_crypto;
+
+	queue->state = NVMET_TCP_Q_LIVE;
+	nvmet_prepare_receive_pdu(queue);
+	return 0;
+free_crypto:
+	if (queue->hdr_digest || queue->data_digest)
+		nvmet_tcp_free_crypto(queue);
+	return ret;
+}
+
+static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
+		struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
+{
+	int ret;
+
+	/* recover the expected data transfer length */
+	req->data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
+
+	if (!nvme_is_write(cmd->req.cmd) ||
+	    req->data_len > cmd->req.port->inline_data_size) {
+		nvmet_prepare_receive_pdu(queue);
+		return;
+	}
+
+	ret = nvmet_tcp_map_data(cmd);
+	if (unlikely(ret)) {
+		pr_err("queue %d: failed to map data\n", queue->idx);
+		nvmet_tcp_fatal_error(queue);
+		return;
+	}
+
+	queue->rcv_state = NVMET_TCP_RECV_DATA;
+	nvmet_tcp_map_pdu_iovec(cmd);
+	cmd->flags |= NVMET_TCP_F_INIT_FAILED;
+}
+
+static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_data_pdu *data = &queue->pdu.data;
+	struct nvmet_tcp_cmd *cmd;
+
+	cmd = &queue->cmds[data->ttag];
+
+	if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
+		pr_err("ttag %u unexpected data offset %u (expected %u)\n",
+			data->ttag, le32_to_cpu(data->data_offset),
+			cmd->rbytes_done);
+		/* FIXME: use path and transport errors */
+		nvmet_req_complete(&cmd->req,
+			NVME_SC_INVALID_FIELD | NVME_SC_DNR);
+		return -EPROTO;
+	}
+
+	cmd->pdu_len = le32_to_cpu(data->data_length);
+	cmd->pdu_recv = 0;
+	nvmet_tcp_map_pdu_iovec(cmd);
+	queue->cmd = cmd;
+	queue->rcv_state = NVMET_TCP_RECV_DATA;
+
+	return 0;
+}
+
+static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+	struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
+	struct nvmet_req *req;
+	int ret;
+
+	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
+		if (hdr->type != nvme_tcp_icreq) {
+			pr_err("unexpected pdu type (%d) before icreq\n",
+				hdr->type);
+			nvmet_tcp_fatal_error(queue);
+			return -EPROTO;
+		}
+		return nvmet_tcp_handle_icreq(queue);
+	}
+
+	if (hdr->type == nvme_tcp_h2c_data) {
+		ret = nvmet_tcp_handle_h2c_data_pdu(queue);
+		if (unlikely(ret))
+			return ret;
+		return 0;
+	}
+
+	queue->cmd = nvmet_tcp_get_cmd(queue);
+	if (unlikely(!queue->cmd)) {
+		/* This should never happen */
+		pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d",
+			queue->idx, queue->nr_cmds, queue->send_list_len,
+			nvme_cmd->common.opcode);
+		nvmet_tcp_fatal_error(queue);
+		return -ENOMEM;
+	}
+
+	req = &queue->cmd->req;
+	memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
+
+	if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
+			&queue->nvme_sq, &nvmet_tcp_ops))) {
+		pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
+			req->cmd, req->cmd->common.command_id,
+			req->cmd->common.opcode,
+			le32_to_cpu(req->cmd->common.dptr.sgl.length));
+
+		nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
+		return -EAGAIN;
+	}
+
+	ret = nvmet_tcp_map_data(queue->cmd);
+	if (unlikely(ret)) {
+		pr_err("queue %d: failed to map data\n", queue->idx);
+		if (nvmet_tcp_has_inline_data(queue->cmd))
+			nvmet_tcp_fatal_error(queue);
+		else
+			nvmet_req_complete(req, ret);
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	if (nvmet_tcp_need_data_in(queue->cmd)) {
+		if (nvmet_tcp_has_inline_data(queue->cmd)) {
+			queue->rcv_state = NVMET_TCP_RECV_DATA;
+			nvmet_tcp_map_pdu_iovec(queue->cmd);
+			return 0;
+		}
+		/* send back R2T */
+		nvmet_tcp_queue_response(&queue->cmd->req);
+		goto out;
+	}
+
+	nvmet_req_execute(&queue->cmd->req);
+out:
+	nvmet_prepare_receive_pdu(queue);
+	return ret;
+}
+
+static const u8 nvme_tcp_pdu_sizes[] = {
+	[nvme_tcp_icreq]	= sizeof(struct nvme_tcp_icreq_pdu),
+	[nvme_tcp_cmd]		= sizeof(struct nvme_tcp_cmd_pdu),
+	[nvme_tcp_h2c_data]	= sizeof(struct nvme_tcp_data_pdu),
+};
+
+static inline u8 nvmet_tcp_pdu_size(u8 type)
+{
+	size_t idx = type;
+
+	return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
+		nvme_tcp_pdu_sizes[idx]) ?
+			nvme_tcp_pdu_sizes[idx] : 0;
+}
+
+static inline bool nvmet_tcp_pdu_valid(u8 type)
+{
+	switch (type) {
+	case nvme_tcp_icreq:
+	case nvme_tcp_cmd:
+	case nvme_tcp_h2c_data:
+		/* fallthru */
+		return true;
+	}
+
+	return false;
+}
+
+static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
+{
+	struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
+	int len;
+	struct kvec iov;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+
+recv:
+	iov.iov_base = (void *)&queue->pdu + queue->offset;
+	iov.iov_len = queue->left;
+	len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			iov.iov_len, msg.msg_flags);
+	if (unlikely(len < 0))
+		return len;
+
+	queue->offset += len;
+	queue->left -= len;
+	if (queue->left)
+		return -EAGAIN;
+
+	if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
+		u8 hdgst = nvmet_tcp_hdgst_len(queue);
+
+		if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
+			pr_err("unexpected pdu type %d\n", hdr->type);
+			nvmet_tcp_fatal_error(queue);
+			return -EIO;
+		}
+
+		if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
+			pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
+			return -EIO;
+		}
+
+		queue->left = hdr->hlen - queue->offset + hdgst;
+		goto recv;
+	}
+
+	if (queue->hdr_digest &&
+	    nvmet_tcp_verify_hdgst(queue, &queue->pdu, queue->offset)) {
+		nvmet_tcp_fatal_error(queue); /* fatal */
+		return -EPROTO;
+	}
+
+	if (queue->data_digest &&
+	    nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
+		nvmet_tcp_fatal_error(queue); /* fatal */
+		return -EPROTO;
+	}
+
+	return nvmet_tcp_done_recv_pdu(queue);
+}
+
+static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
+{
+	struct nvmet_tcp_queue *queue = cmd->queue;
+
+	nvmet_tcp_ddgst(queue->rcv_hash, cmd);
+	queue->offset = 0;
+	queue->left = NVME_TCP_DIGEST_LENGTH;
+	queue->rcv_state = NVMET_TCP_RECV_DDGST;
+}
+
+static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd  *cmd = queue->cmd;
+	int ret;
+
+	while (msg_data_left(&cmd->recv_msg)) {
+		ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
+			cmd->recv_msg.msg_flags);
+		if (ret <= 0)
+			return ret;
+
+		cmd->pdu_recv += ret;
+		cmd->rbytes_done += ret;
+	}
+
+	nvmet_tcp_unmap_pdu_iovec(cmd);
+
+	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
+	    cmd->rbytes_done == cmd->req.transfer_len) {
+		if (queue->data_digest) {
+			nvmet_tcp_prep_recv_ddgst(cmd);
+			return 0;
+		}
+		nvmet_req_execute(&cmd->req);
+	}
+
+	nvmet_prepare_receive_pdu(queue);
+	return 0;
+}
+
+static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmd = queue->cmd;
+	int ret;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
+	struct kvec iov = {
+		.iov_base = (void *)&cmd->recv_ddgst + queue->offset,
+		.iov_len = queue->left
+	};
+
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			iov.iov_len, msg.msg_flags);
+	if (unlikely(ret < 0))
+		return ret;
+
+	queue->offset += ret;
+	queue->left -= ret;
+	if (queue->left)
+		return -EAGAIN;
+
+	if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
+		pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
+			queue->idx, cmd->req.cmd->common.command_id,
+			queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
+			le32_to_cpu(cmd->exp_ddgst));
+		nvmet_tcp_finish_cmd(cmd);
+		nvmet_tcp_fatal_error(queue);
+		ret = -EPROTO;
+		goto out;
+	}
+
+	if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
+	    cmd->rbytes_done == cmd->req.transfer_len)
+		nvmet_req_execute(&cmd->req);
+	ret = 0;
+out:
+	nvmet_prepare_receive_pdu(queue);
+	return ret;
+}
+
+static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
+{
+	int result;
+
+	if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
+		return 0;
+
+	if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
+		result = nvmet_tcp_try_recv_pdu(queue);
+		if (result != 0)
+			goto done_recv;
+	}
+
+	if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
+		result = nvmet_tcp_try_recv_data(queue);
+		if (result != 0)
+			goto done_recv;
+	}
+
+	if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
+		result = nvmet_tcp_try_recv_ddgst(queue);
+		if (result != 0)
+			goto done_recv;
+	}
+
+done_recv:
+	if (result < 0) {
+		if (result == -EAGAIN)
+			return 0;
+		return result;
+	}
+	return 1;
+}
+
+static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
+		int budget, int *recvs)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < budget; i++) {
+		ret = nvmet_tcp_try_recv_one(queue);
+		if (ret <= 0)
+			break;
+		(*recvs)++;
+	}
+
+	return ret;
+}
+
+static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
+{
+	spin_lock(&queue->state_lock);
+	if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
+		queue->state = NVMET_TCP_Q_DISCONNECTING;
+		schedule_work(&queue->release_work);
+	}
+	spin_unlock(&queue->state_lock);
+}
+
+static void nvmet_tcp_io_work(struct work_struct *w)
+{
+	struct nvmet_tcp_queue *queue =
+		container_of(w, struct nvmet_tcp_queue, io_work);
+	bool pending;
+	int ret, ops = 0;
+
+	do {
+		pending = false;
+
+		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
+		if (ret > 0) {
+			pending = true;
+		} else if (ret < 0) {
+			if (ret == -EPIPE || ret == -ECONNRESET)
+				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+			else
+				nvmet_tcp_fatal_error(queue);
+			return;
+		}
+
+		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
+		if (ret > 0) {
+			/* transmitted message/data */
+			pending = true;
+		} else if (ret < 0) {
+			if (ret == -EPIPE || ret == -ECONNRESET)
+				kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+			else
+				nvmet_tcp_fatal_error(queue);
+			return;
+		}
+
+	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
+
+	/*
+	 * We exahusted our budget, requeue our selves
+	 */
+	if (pending)
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+}
+
+static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
+		struct nvmet_tcp_cmd *c)
+{
+	u8 hdgst = nvmet_tcp_hdgst_len(queue);
+
+	c->queue = queue;
+	c->req.port = queue->port->nport;
+
+	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->cmd_pdu)
+		return -ENOMEM;
+	c->req.cmd = &c->cmd_pdu->cmd;
+
+	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->rsp_pdu)
+		goto out_free_cmd;
+	c->req.rsp = &c->rsp_pdu->cqe;
+
+	c->data_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->data_pdu)
+		goto out_free_rsp;
+
+	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
+			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	if (!c->r2t_pdu)
+		goto out_free_data;
+
+	c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
+
+	list_add_tail(&c->entry, &queue->free_list);
+
+	return 0;
+out_free_data:
+	page_frag_free(c->data_pdu);
+out_free_rsp:
+	page_frag_free(c->rsp_pdu);
+out_free_cmd:
+	page_frag_free(c->cmd_pdu);
+	return -ENOMEM;
+}
+
+static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
+{
+	page_frag_free(c->r2t_pdu);
+	page_frag_free(c->data_pdu);
+	page_frag_free(c->rsp_pdu);
+	page_frag_free(c->cmd_pdu);
+}
+
+static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmds;
+	int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
+
+	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
+	if (!cmds)
+		goto out;
+
+	for (i = 0; i < nr_cmds; i++) {
+		ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
+		if (ret)
+			goto out_free;
+	}
+
+	queue->cmds = cmds;
+
+	return 0;
+out_free:
+	while (--i >= 0)
+		nvmet_tcp_free_cmd(cmds + i);
+	kfree(cmds);
+out:
+	return ret;
+}
+
+static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmds = queue->cmds;
+	int i;
+
+	for (i = 0; i < queue->nr_cmds; i++)
+		nvmet_tcp_free_cmd(cmds + i);
+
+	nvmet_tcp_free_cmd(&queue->connect);
+	kfree(cmds);
+}
+
+static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
+{
+	struct socket *sock = queue->sock;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_data_ready =  queue->data_ready;
+	sock->sk->sk_state_change = queue->state_change;
+	sock->sk->sk_write_space = queue->write_space;
+	sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
+{
+	nvmet_req_uninit(&cmd->req);
+	nvmet_tcp_unmap_pdu_iovec(cmd);
+	sgl_free(cmd->req.sg);
+}
+
+static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_cmd *cmd = queue->cmds;
+	int i;
+
+	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
+		if (nvmet_tcp_need_data_in(cmd))
+			nvmet_tcp_finish_cmd(cmd);
+	}
+
+	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
+		/* failed in connect */
+		nvmet_tcp_finish_cmd(&queue->connect);
+	}
+}
+
+static void nvmet_tcp_release_queue_work(struct work_struct *w)
+{
+	struct nvmet_tcp_queue *queue =
+		container_of(w, struct nvmet_tcp_queue, release_work);
+
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_del_init(&queue->queue_list);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+
+	nvmet_tcp_restore_socket_callbacks(queue);
+	flush_work(&queue->io_work);
+
+	nvmet_tcp_uninit_data_in_cmds(queue);
+	nvmet_sq_destroy(&queue->nvme_sq);
+	cancel_work_sync(&queue->io_work);
+	sock_release(queue->sock);
+	nvmet_tcp_free_cmds(queue);
+	if (queue->hdr_digest || queue->data_digest)
+		nvmet_tcp_free_crypto(queue);
+	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+
+	kfree(queue);
+}
+
+static void nvmet_tcp_data_ready(struct sock *sk)
+{
+	struct nvmet_tcp_queue *queue;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (likely(queue))
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_write_space(struct sock *sk)
+{
+	struct nvmet_tcp_queue *queue;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (unlikely(!queue))
+		goto out;
+
+	if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
+		queue->write_space(sk);
+		goto out;
+	}
+
+	if (sk_stream_is_writeable(sk)) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+	}
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvmet_tcp_state_change(struct sock *sk)
+{
+	struct nvmet_tcp_queue *queue;
+
+	write_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (!queue)
+		goto done;
+
+	switch (sk->sk_state) {
+	case TCP_FIN_WAIT1:
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSE:
+		/* FALLTHRU */
+		sk->sk_user_data = NULL;
+		nvmet_tcp_schedule_release_queue(queue);
+		break;
+	default:
+		pr_warn("queue %d unhandled state %d\n",
+			queue->idx, sk->sk_state);
+	}
+done:
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
+{
+	struct socket *sock = queue->sock;
+	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+	int ret;
+
+	ret = kernel_getsockname(sock,
+		(struct sockaddr *)&queue->sockaddr);
+	if (ret < 0)
+		return ret;
+
+	ret = kernel_getpeername(sock,
+		(struct sockaddr *)&queue->sockaddr_peer);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Cleanup whatever is sitting in the TCP transmit queue on socket
+	 * close. This is done to prevent stale data from being sent should
+	 * the network connection be restored before TCP times out.
+	 */
+	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+			(char *)&sol, sizeof(sol));
+	if (ret)
+		return ret;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data = queue;
+	queue->data_ready = sock->sk->sk_data_ready;
+	sock->sk->sk_data_ready = nvmet_tcp_data_ready;
+	queue->state_change = sock->sk->sk_state_change;
+	sock->sk->sk_state_change = nvmet_tcp_state_change;
+	queue->write_space = sock->sk->sk_write_space;
+	sock->sk->sk_write_space = nvmet_tcp_write_space;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+
+	return 0;
+}
+
+static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
+		struct socket *newsock)
+{
+	struct nvmet_tcp_queue *queue;
+	int ret;
+
+	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
+	if (!queue)
+		return -ENOMEM;
+
+	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
+	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
+	queue->sock = newsock;
+	queue->port = port;
+	queue->nr_cmds = 0;
+	spin_lock_init(&queue->state_lock);
+	queue->state = NVMET_TCP_Q_CONNECTING;
+	INIT_LIST_HEAD(&queue->free_list);
+	init_llist_head(&queue->resp_list);
+	INIT_LIST_HEAD(&queue->resp_send_list);
+
+	queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
+	if (queue->idx < 0) {
+		ret = queue->idx;
+		goto out_free_queue;
+	}
+
+	ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
+	if (ret)
+		goto out_ida_remove;
+
+	ret = nvmet_sq_init(&queue->nvme_sq);
+	if (ret)
+		goto out_free_connect;
+
+	port->last_cpu = cpumask_next_wrap(port->last_cpu,
+				cpu_online_mask, -1, false);
+	queue->cpu = port->last_cpu;
+	nvmet_prepare_receive_pdu(queue);
+
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+
+	ret = nvmet_tcp_set_queue_sock(queue);
+	if (ret)
+		goto out_destroy_sq;
+
+	queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
+
+	return 0;
+out_destroy_sq:
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_del_init(&queue->queue_list);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+	nvmet_sq_destroy(&queue->nvme_sq);
+out_free_connect:
+	nvmet_tcp_free_cmd(&queue->connect);
+out_ida_remove:
+	ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
+out_free_queue:
+	kfree(queue);
+	return ret;
+}
+
+static void nvmet_tcp_accept_work(struct work_struct *w)
+{
+	struct nvmet_tcp_port *port =
+		container_of(w, struct nvmet_tcp_port, accept_work);
+	struct socket *newsock;
+	int ret;
+
+	while (true) {
+		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
+		if (ret < 0) {
+			if (ret != -EAGAIN)
+				pr_warn("failed to accept err=%d\n", ret);
+			return;
+		}
+		ret = nvmet_tcp_alloc_queue(port, newsock);
+		if (ret) {
+			pr_err("failed to allocate queue\n");
+			sock_release(newsock);
+		}
+	}
+}
+
+static void nvmet_tcp_listen_data_ready(struct sock *sk)
+{
+	struct nvmet_tcp_port *port;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	port = sk->sk_user_data;
+	if (!port)
+		goto out;
+
+	if (sk->sk_state == TCP_LISTEN)
+		schedule_work(&port->accept_work);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static int nvmet_tcp_add_port(struct nvmet_port *nport)
+{
+	struct nvmet_tcp_port *port;
+	__kernel_sa_family_t af;
+	int opt, ret;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return -ENOMEM;
+
+	switch (nport->disc_addr.adrfam) {
+	case NVMF_ADDR_FAMILY_IP4:
+		af = AF_INET;
+		break;
+	case NVMF_ADDR_FAMILY_IP6:
+		af = AF_INET6;
+		break;
+	default:
+		pr_err("address family %d not supported\n",
+				nport->disc_addr.adrfam);
+		ret = -EINVAL;
+		goto err_port;
+	}
+
+	ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
+			nport->disc_addr.trsvcid, &port->addr);
+	if (ret) {
+		pr_err("malformed ip/port passed: %s:%s\n",
+			nport->disc_addr.traddr, nport->disc_addr.trsvcid);
+		goto err_port;
+	}
+
+	port->nport = nport;
+	port->last_cpu = -1;
+	INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
+	if (port->nport->inline_data_size < 0)
+		port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
+
+	ret = sock_create(port->addr.ss_family, SOCK_STREAM,
+				IPPROTO_TCP, &port->sock);
+	if (ret) {
+		pr_err("failed to create a socket\n");
+		goto err_port;
+	}
+
+	port->sock->sk->sk_user_data = port;
+	port->data_ready = port->sock->sk->sk_data_ready;
+	port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
+
+	opt = 1;
+	ret = kernel_setsockopt(port->sock, IPPROTO_TCP,
+			TCP_NODELAY, (char *)&opt, sizeof(opt));
+	if (ret) {
+		pr_err("failed to set TCP_NODELAY sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR,
+			(char *)&opt, sizeof(opt));
+	if (ret) {
+		pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
+			sizeof(port->addr));
+	if (ret) {
+		pr_err("failed to bind port socket %d\n", ret);
+		goto err_sock;
+	}
+
+	ret = kernel_listen(port->sock, 128);
+	if (ret) {
+		pr_err("failed to listen %d on port sock\n", ret);
+		goto err_sock;
+	}
+
+	nport->priv = port;
+	pr_info("enabling port %d (%pISpc)\n",
+		le16_to_cpu(nport->disc_addr.portid), &port->addr);
+
+	return 0;
+
+err_sock:
+	sock_release(port->sock);
+err_port:
+	kfree(port);
+	return ret;
+}
+
+static void nvmet_tcp_remove_port(struct nvmet_port *nport)
+{
+	struct nvmet_tcp_port *port = nport->priv;
+
+	write_lock_bh(&port->sock->sk->sk_callback_lock);
+	port->sock->sk->sk_data_ready = port->data_ready;
+	port->sock->sk->sk_user_data = NULL;
+	write_unlock_bh(&port->sock->sk->sk_callback_lock);
+	cancel_work_sync(&port->accept_work);
+
+	sock_release(port->sock);
+	kfree(port);
+}
+
+static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
+{
+	struct nvmet_tcp_queue *queue;
+
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+		if (queue->nvme_sq.ctrl == ctrl)
+			kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+}
+
+static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
+{
+	struct nvmet_tcp_queue *queue =
+		container_of(sq, struct nvmet_tcp_queue, nvme_sq);
+
+	if (sq->qid == 0) {
+		/* Let inflight controller teardown complete */
+		flush_scheduled_work();
+	}
+
+	queue->nr_cmds = sq->size * 2;
+	if (nvmet_tcp_alloc_cmds(queue))
+		return NVME_SC_INTERNAL;
+	return 0;
+}
+
+static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
+		struct nvmet_port *nport, char *traddr)
+{
+	struct nvmet_tcp_port *port = nport->priv;
+
+	if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
+		struct nvmet_tcp_cmd *cmd =
+			container_of(req, struct nvmet_tcp_cmd, req);
+		struct nvmet_tcp_queue *queue = cmd->queue;
+
+		sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
+	} else {
+		memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
+	}
+}
+
+static struct nvmet_fabrics_ops nvmet_tcp_ops = {
+	.owner			= THIS_MODULE,
+	.type			= NVMF_TRTYPE_TCP,
+	.msdbd			= 1,
+	.has_keyed_sgls		= 0,
+	.add_port		= nvmet_tcp_add_port,
+	.remove_port		= nvmet_tcp_remove_port,
+	.queue_response		= nvmet_tcp_queue_response,
+	.delete_ctrl		= nvmet_tcp_delete_ctrl,
+	.install_queue		= nvmet_tcp_install_queue,
+	.disc_traddr		= nvmet_tcp_disc_port_addr,
+};
+
+static int __init nvmet_tcp_init(void)
+{
+	int ret;
+
+	nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
+	if (!nvmet_tcp_wq)
+		return -ENOMEM;
+
+	ret = nvmet_register_transport(&nvmet_tcp_ops);
+	if (ret)
+		goto err;
+
+	return 0;
+err:
+	destroy_workqueue(nvmet_tcp_wq);
+	return ret;
+}
+
+static void __exit nvmet_tcp_exit(void)
+{
+	struct nvmet_tcp_queue *queue;
+
+	nvmet_unregister_transport(&nvmet_tcp_ops);
+
+	flush_scheduled_work();
+	mutex_lock(&nvmet_tcp_queue_mutex);
+	list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
+		kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+	mutex_unlock(&nvmet_tcp_queue_mutex);
+	flush_scheduled_work();
+
+	destroy_workqueue(nvmet_tcp_wq);
+}
+
+module_init(nvmet_tcp_init);
+module_exit(nvmet_tcp_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */

From ad4f530e95a7b88a332035b9e0c5384441356576 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:16 -0800
Subject: [PATCH 292/351] nvmet: allow configfs tcp trtype configuration

Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/configfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index db2cb64be7ba..618bbd006544 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -34,6 +34,7 @@ static const struct nvmet_transport_name {
 } nvmet_transport_names[] = {
 	{ NVMF_TRTYPE_RDMA,	"rdma" },
 	{ NVMF_TRTYPE_FC,	"fc" },
+	{ NVMF_TRTYPE_TCP,	"tcp" },
 	{ NVMF_TRTYPE_LOOP,	"loop" },
 };
 

From 3f2304f8c6d6ed97849057bd16fee99e434ca796 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@lightbitslabs.com>
Date: Mon, 3 Dec 2018 17:52:17 -0800
Subject: [PATCH 293/351] nvme-tcp: add NVMe over TCP host driver

This patch implements the NVMe over TCP host driver. It can be used to
connect to remote NVMe over Fabrics subsystems over good old TCP/IP.

The driver implements the TP 8000 of how nvme over fabrics capsules and
data are encapsulated in nvme-tcp pdus and exchaged on top of a TCP byte
stream. nvme-tcp header and data digest are supported as well.

To connect to all NVMe over Fabrics controllers reachable on a given taget
port over TCP use the following command:

	nvme connect-all -t tcp -a $IPADDR

This requires the latest version of nvme-cli with TCP support.

Signed-off-by: Sagi Grimberg <sagi@lightbitslabs.com>
Signed-off-by: Roy Shterman <roys@lightbitslabs.com>
Signed-off-by: Solganik Alexander <sashas@lightbitslabs.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/Kconfig  |   15 +
 drivers/nvme/host/Makefile |    3 +
 drivers/nvme/host/tcp.c    | 2242 ++++++++++++++++++++++++++++++++++++
 3 files changed, 2260 insertions(+)
 create mode 100644 drivers/nvme/host/tcp.c

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 88a8b5916624..0f345e207675 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -57,3 +57,18 @@ config NVME_FC
 	  from https://github.com/linux-nvme/nvme-cli.
 
 	  If unsure, say N.
+
+config NVME_TCP
+	tristate "NVM Express over Fabrics TCP host driver"
+	depends on INET
+	depends on BLK_DEV_NVME
+	select NVME_FABRICS
+	help
+	  This provides support for the NVMe over Fabrics protocol using
+	  the TCP transport.  This allows you to use remote block devices
+	  exported using the NVMe protocol set.
+
+	  To configure a NVMe over Fabrics controller use the nvme-cli tool
+	  from https://github.com/linux-nvme/nvme-cli.
+
+	  If unsure, say N.
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index aea459c65ae1..8a4b671c5f0c 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_BLK_DEV_NVME)		+= nvme.o
 obj-$(CONFIG_NVME_FABRICS)		+= nvme-fabrics.o
 obj-$(CONFIG_NVME_RDMA)			+= nvme-rdma.o
 obj-$(CONFIG_NVME_FC)			+= nvme-fc.o
+obj-$(CONFIG_NVME_TCP)			+= nvme-tcp.o
 
 nvme-core-y				:= core.o
 nvme-core-$(CONFIG_TRACING)		+= trace.o
@@ -21,3 +22,5 @@ nvme-fabrics-y				+= fabrics.o
 nvme-rdma-y				+= rdma.o
 
 nvme-fc-y				+= fc.o
+
+nvme-tcp-y				+= tcp.o
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
new file mode 100644
index 000000000000..15543358e245
--- /dev/null
+++ b/drivers/nvme/host/tcp.c
@@ -0,0 +1,2242 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics TCP host.
+ * Copyright (c) 2018 Lightbits Labs. All rights reserved.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/nvme-tcp.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <linux/blk-mq.h>
+#include <crypto/hash.h>
+
+#include "nvme.h"
+#include "fabrics.h"
+
+struct nvme_tcp_queue;
+
+enum nvme_tcp_send_state {
+	NVME_TCP_SEND_CMD_PDU = 0,
+	NVME_TCP_SEND_H2C_PDU,
+	NVME_TCP_SEND_DATA,
+	NVME_TCP_SEND_DDGST,
+};
+
+struct nvme_tcp_request {
+	struct nvme_request	req;
+	void			*pdu;
+	struct nvme_tcp_queue	*queue;
+	u32			data_len;
+	u32			pdu_len;
+	u32			pdu_sent;
+	u16			ttag;
+	struct list_head	entry;
+	u32			ddgst;
+
+	struct bio		*curr_bio;
+	struct iov_iter		iter;
+
+	/* send state */
+	size_t			offset;
+	size_t			data_sent;
+	enum nvme_tcp_send_state state;
+};
+
+enum nvme_tcp_queue_flags {
+	NVME_TCP_Q_ALLOCATED	= 0,
+	NVME_TCP_Q_LIVE		= 1,
+};
+
+enum nvme_tcp_recv_state {
+	NVME_TCP_RECV_PDU = 0,
+	NVME_TCP_RECV_DATA,
+	NVME_TCP_RECV_DDGST,
+};
+
+struct nvme_tcp_ctrl;
+struct nvme_tcp_queue {
+	struct socket		*sock;
+	struct work_struct	io_work;
+	int			io_cpu;
+
+	spinlock_t		lock;
+	struct list_head	send_list;
+
+	/* recv state */
+	void			*pdu;
+	int			pdu_remaining;
+	int			pdu_offset;
+	size_t			data_remaining;
+	size_t			ddgst_remaining;
+
+	/* send state */
+	struct nvme_tcp_request *request;
+
+	int			queue_size;
+	size_t			cmnd_capsule_len;
+	struct nvme_tcp_ctrl	*ctrl;
+	unsigned long		flags;
+	bool			rd_enabled;
+
+	bool			hdr_digest;
+	bool			data_digest;
+	struct ahash_request	*rcv_hash;
+	struct ahash_request	*snd_hash;
+	__le32			exp_ddgst;
+	__le32			recv_ddgst;
+
+	struct page_frag_cache	pf_cache;
+
+	void (*state_change)(struct sock *);
+	void (*data_ready)(struct sock *);
+	void (*write_space)(struct sock *);
+};
+
+struct nvme_tcp_ctrl {
+	/* read only in the hot path */
+	struct nvme_tcp_queue	*queues;
+	struct blk_mq_tag_set	tag_set;
+
+	/* other member variables */
+	struct list_head	list;
+	struct blk_mq_tag_set	admin_tag_set;
+	struct sockaddr_storage addr;
+	struct sockaddr_storage src_addr;
+	struct nvme_ctrl	ctrl;
+
+	struct work_struct	err_work;
+	struct delayed_work	connect_work;
+	struct nvme_tcp_request async_req;
+};
+
+static LIST_HEAD(nvme_tcp_ctrl_list);
+static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
+static struct workqueue_struct *nvme_tcp_wq;
+static struct blk_mq_ops nvme_tcp_mq_ops;
+static struct blk_mq_ops nvme_tcp_admin_mq_ops;
+
+static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
+{
+	return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
+}
+
+static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
+{
+	return queue - queue->ctrl->queues;
+}
+
+static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
+{
+	u32 queue_idx = nvme_tcp_queue_id(queue);
+
+	if (queue_idx == 0)
+		return queue->ctrl->admin_tag_set.tags[queue_idx];
+	return queue->ctrl->tag_set.tags[queue_idx - 1];
+}
+
+static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
+{
+	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
+{
+	return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
+static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
+{
+	return queue->cmnd_capsule_len - sizeof(struct nvme_command);
+}
+
+static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
+{
+	return req == &req->queue->ctrl->async_req;
+}
+
+static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
+{
+	struct request *rq;
+	unsigned int bytes;
+
+	if (unlikely(nvme_tcp_async_req(req)))
+		return false; /* async events don't have a request */
+
+	rq = blk_mq_rq_from_pdu(req);
+	bytes = blk_rq_payload_bytes(rq);
+
+	return rq_data_dir(rq) == WRITE && bytes &&
+		bytes <= nvme_tcp_inline_data_size(req->queue);
+}
+
+static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
+{
+	return req->iter.bvec->bv_page;
+}
+
+static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
+{
+	return req->iter.bvec->bv_offset + req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
+{
+	return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
+			req->pdu_len - req->pdu_sent);
+}
+
+static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
+{
+	return req->iter.iov_offset;
+}
+
+static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
+{
+	return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
+			req->pdu_len - req->pdu_sent : 0;
+}
+
+static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
+		int len)
+{
+	return nvme_tcp_pdu_data_left(req) <= len;
+}
+
+static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
+		unsigned int dir)
+{
+	struct request *rq = blk_mq_rq_from_pdu(req);
+	struct bio_vec *vec;
+	unsigned int size;
+	int nsegs;
+	size_t offset;
+
+	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
+		vec = &rq->special_vec;
+		nsegs = 1;
+		size = blk_rq_payload_bytes(rq);
+		offset = 0;
+	} else {
+		struct bio *bio = req->curr_bio;
+
+		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+		nsegs = bio_segments(bio);
+		size = bio->bi_iter.bi_size;
+		offset = bio->bi_iter.bi_bvec_done;
+	}
+
+	iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
+	req->iter.iov_offset = offset;
+}
+
+static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
+		int len)
+{
+	req->data_sent += len;
+	req->pdu_sent += len;
+	iov_iter_advance(&req->iter, len);
+	if (!iov_iter_count(&req->iter) &&
+	    req->data_sent < req->data_len) {
+		req->curr_bio = req->curr_bio->bi_next;
+		nvme_tcp_init_iter(req, WRITE);
+	}
+}
+
+static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+
+	spin_lock(&queue->lock);
+	list_add_tail(&req->entry, &queue->send_list);
+	spin_unlock(&queue->lock);
+
+	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static inline struct nvme_tcp_request *
+nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
+{
+	struct nvme_tcp_request *req;
+
+	spin_lock(&queue->lock);
+	req = list_first_entry_or_null(&queue->send_list,
+			struct nvme_tcp_request, entry);
+	if (req)
+		list_del(&req->entry);
+	spin_unlock(&queue->lock);
+
+	return req;
+}
+
+static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, u32 *dgst)
+{
+	ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
+	crypto_ahash_final(hash);
+}
+
+static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
+		struct page *page, off_t off, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_marker(&sg, 1);
+	sg_set_page(&sg, page, len, off);
+	ahash_request_set_crypt(hash, &sg, NULL, len);
+	crypto_ahash_update(hash);
+}
+
+static inline void nvme_tcp_hdgst(struct ahash_request *hash,
+		void *pdu, size_t len)
+{
+	struct scatterlist sg;
+
+	sg_init_one(&sg, pdu, len);
+	ahash_request_set_crypt(hash, &sg, pdu + len, len);
+	crypto_ahash_digest(hash);
+}
+
+static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
+		void *pdu, size_t pdu_len)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	__le32 recv_digest;
+	__le32 exp_digest;
+
+	if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d: header digest flag is cleared\n",
+			nvme_tcp_queue_id(queue));
+		return -EPROTO;
+	}
+
+	recv_digest = *(__le32 *)(pdu + hdr->hlen);
+	nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
+	exp_digest = *(__le32 *)(pdu + hdr->hlen);
+	if (recv_digest != exp_digest) {
+		dev_err(queue->ctrl->ctrl.device,
+			"header digest error: recv %#x expected %#x\n",
+			le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
+{
+	struct nvme_tcp_hdr *hdr = pdu;
+	u8 digest_len = nvme_tcp_hdgst_len(queue);
+	u32 len;
+
+	len = le32_to_cpu(hdr->plen) - hdr->hlen -
+		((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
+
+	if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d: data digest flag is cleared\n",
+		nvme_tcp_queue_id(queue));
+		return -EPROTO;
+	}
+	crypto_ahash_init(queue->rcv_hash);
+
+	return 0;
+}
+
+static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
+		struct request *rq, unsigned int hctx_idx)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+
+	page_frag_free(req->pdu);
+}
+
+static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
+		struct request *rq, unsigned int hctx_idx,
+		unsigned int numa_node)
+{
+	struct nvme_tcp_ctrl *ctrl = set->driver_data;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
+	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+	req->pdu = page_frag_alloc(&queue->pf_cache,
+		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+		GFP_KERNEL | __GFP_ZERO);
+	if (!req->pdu)
+		return -ENOMEM;
+
+	req->queue = queue;
+	nvme_req(rq)->ctrl = &ctrl->ctrl;
+
+	return 0;
+}
+
+static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_tcp_ctrl *ctrl = data;
+	struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
+
+	hctx->driver_data = queue;
+	return 0;
+}
+
+static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+		unsigned int hctx_idx)
+{
+	struct nvme_tcp_ctrl *ctrl = data;
+	struct nvme_tcp_queue *queue = &ctrl->queues[0];
+
+	hctx->driver_data = queue;
+	return 0;
+}
+
+static enum nvme_tcp_recv_state
+nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
+{
+	return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
+		(queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
+		NVME_TCP_RECV_DATA;
+}
+
+static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
+{
+	queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
+				nvme_tcp_hdgst_len(queue);
+	queue->pdu_offset = 0;
+	queue->data_remaining = -1;
+	queue->ddgst_remaining = 0;
+}
+
+static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
+{
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		return;
+
+	queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
+}
+
+static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
+		struct nvme_completion *cqe)
+{
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag 0x%x not found\n",
+			nvme_tcp_queue_id(queue), cqe->command_id);
+		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+		return -EINVAL;
+	}
+
+	nvme_end_request(rq, cqe->status, cqe->result);
+
+	return 0;
+}
+
+static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_data_pdu *pdu)
+{
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+
+	if (!blk_rq_payload_bytes(rq)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x unexpected data\n",
+			nvme_tcp_queue_id(queue), rq->tag);
+		return -EIO;
+	}
+
+	queue->data_remaining = le32_to_cpu(pdu->data_length);
+
+	return 0;
+
+}
+
+static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_rsp_pdu *pdu)
+{
+	struct nvme_completion *cqe = &pdu->cqe;
+	int ret = 0;
+
+	/*
+	 * AEN requests are special as they don't time out and can
+	 * survive any kind of queue freeze and often don't respond to
+	 * aborts.  We don't even bother to allocate a struct request
+	 * for them but rather special case them here.
+	 */
+	if (unlikely(nvme_tcp_queue_id(queue) == 0 &&
+	    cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH))
+		nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
+				&cqe->result);
+	else
+		ret = nvme_tcp_process_nvme_cqe(queue, cqe);
+
+	return ret;
+}
+
+static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
+		struct nvme_tcp_r2t_pdu *pdu)
+{
+	struct nvme_tcp_data_pdu *data = req->pdu;
+	struct nvme_tcp_queue *queue = req->queue;
+	struct request *rq = blk_mq_rq_from_pdu(req);
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+	u8 ddgst = nvme_tcp_ddgst_len(queue);
+
+	req->pdu_len = le32_to_cpu(pdu->r2t_length);
+	req->pdu_sent = 0;
+
+	if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"req %d r2t len %u exceeded data len %u (%zu sent)\n",
+			rq->tag, req->pdu_len, req->data_len,
+			req->data_sent);
+		return -EPROTO;
+	}
+
+	if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"req %d unexpected r2t offset %u (expected %zu)\n",
+			rq->tag, le32_to_cpu(pdu->r2t_offset),
+			req->data_sent);
+		return -EPROTO;
+	}
+
+	memset(data, 0, sizeof(*data));
+	data->hdr.type = nvme_tcp_h2c_data;
+	data->hdr.flags = NVME_TCP_F_DATA_LAST;
+	if (queue->hdr_digest)
+		data->hdr.flags |= NVME_TCP_F_HDGST;
+	if (queue->data_digest)
+		data->hdr.flags |= NVME_TCP_F_DDGST;
+	data->hdr.hlen = sizeof(*data);
+	data->hdr.pdo = data->hdr.hlen + hdgst;
+	data->hdr.plen =
+		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
+	data->ttag = pdu->ttag;
+	data->command_id = rq->tag;
+	data->data_offset = cpu_to_le32(req->data_sent);
+	data->data_length = cpu_to_le32(req->pdu_len);
+	return 0;
+}
+
+static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
+		struct nvme_tcp_r2t_pdu *pdu)
+{
+	struct nvme_tcp_request *req;
+	struct request *rq;
+	int ret;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
+	if (unlikely(ret))
+		return ret;
+
+	req->state = NVME_TCP_SEND_H2C_PDU;
+	req->offset = 0;
+
+	nvme_tcp_queue_request(req);
+
+	return 0;
+}
+
+static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+		unsigned int *offset, size_t *len)
+{
+	struct nvme_tcp_hdr *hdr;
+	char *pdu = queue->pdu;
+	size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
+	int ret;
+
+	ret = skb_copy_bits(skb, *offset,
+		&pdu[queue->pdu_offset], rcv_len);
+	if (unlikely(ret))
+		return ret;
+
+	queue->pdu_remaining -= rcv_len;
+	queue->pdu_offset += rcv_len;
+	*offset += rcv_len;
+	*len -= rcv_len;
+	if (queue->pdu_remaining)
+		return 0;
+
+	hdr = queue->pdu;
+	if (queue->hdr_digest) {
+		ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
+		if (unlikely(ret))
+			return ret;
+	}
+
+
+	if (queue->data_digest) {
+		ret = nvme_tcp_check_ddgst(queue, queue->pdu);
+		if (unlikely(ret))
+			return ret;
+	}
+
+	switch (hdr->type) {
+	case nvme_tcp_c2h_data:
+		ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
+		break;
+	case nvme_tcp_rsp:
+		nvme_tcp_init_recv_ctx(queue);
+		ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu);
+		break;
+	case nvme_tcp_r2t:
+		nvme_tcp_init_recv_ctx(queue);
+		ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
+		break;
+	default:
+		dev_err(queue->ctrl->ctrl.device,
+			"unsupported pdu type (%d)\n", hdr->type);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
+			      unsigned int *offset, size_t *len)
+{
+	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
+	struct nvme_tcp_request *req;
+	struct request *rq;
+
+	rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
+	if (!rq) {
+		dev_err(queue->ctrl->ctrl.device,
+			"queue %d tag %#x not found\n",
+			nvme_tcp_queue_id(queue), pdu->command_id);
+		return -ENOENT;
+	}
+	req = blk_mq_rq_to_pdu(rq);
+
+	while (true) {
+		int recv_len, ret;
+
+		recv_len = min_t(size_t, *len, queue->data_remaining);
+		if (!recv_len)
+			break;
+
+		if (!iov_iter_count(&req->iter)) {
+			req->curr_bio = req->curr_bio->bi_next;
+
+			/*
+			 * If we don`t have any bios it means that controller
+			 * sent more data than we requested, hence error
+			 */
+			if (!req->curr_bio) {
+				dev_err(queue->ctrl->ctrl.device,
+					"queue %d no space in request %#x",
+					nvme_tcp_queue_id(queue), rq->tag);
+				nvme_tcp_init_recv_ctx(queue);
+				return -EIO;
+			}
+			nvme_tcp_init_iter(req, READ);
+		}
+
+		/* we can read only from what is left in this bio */
+		recv_len = min_t(size_t, recv_len,
+				iov_iter_count(&req->iter));
+
+		if (queue->data_digest)
+			ret = skb_copy_and_hash_datagram_iter(skb, *offset,
+				&req->iter, recv_len, queue->rcv_hash);
+		else
+			ret = skb_copy_datagram_iter(skb, *offset,
+					&req->iter, recv_len);
+		if (ret) {
+			dev_err(queue->ctrl->ctrl.device,
+				"queue %d failed to copy request %#x data",
+				nvme_tcp_queue_id(queue), rq->tag);
+			return ret;
+		}
+
+		*len -= recv_len;
+		*offset += recv_len;
+		queue->data_remaining -= recv_len;
+	}
+
+	if (!queue->data_remaining) {
+		if (queue->data_digest) {
+			nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
+			queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
+		} else {
+			nvme_tcp_init_recv_ctx(queue);
+		}
+	}
+
+	return 0;
+}
+
+static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
+		struct sk_buff *skb, unsigned int *offset, size_t *len)
+{
+	char *ddgst = (char *)&queue->recv_ddgst;
+	size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
+	off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
+	int ret;
+
+	ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
+	if (unlikely(ret))
+		return ret;
+
+	queue->ddgst_remaining -= recv_len;
+	*offset += recv_len;
+	*len -= recv_len;
+	if (queue->ddgst_remaining)
+		return 0;
+
+	if (queue->recv_ddgst != queue->exp_ddgst) {
+		dev_err(queue->ctrl->ctrl.device,
+			"data digest error: recv %#x expected %#x\n",
+			le32_to_cpu(queue->recv_ddgst),
+			le32_to_cpu(queue->exp_ddgst));
+		return -EIO;
+	}
+
+	nvme_tcp_init_recv_ctx(queue);
+	return 0;
+}
+
+static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
+			     unsigned int offset, size_t len)
+{
+	struct nvme_tcp_queue *queue = desc->arg.data;
+	size_t consumed = len;
+	int result;
+
+	while (len) {
+		switch (nvme_tcp_recv_state(queue)) {
+		case NVME_TCP_RECV_PDU:
+			result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
+			break;
+		case NVME_TCP_RECV_DATA:
+			result = nvme_tcp_recv_data(queue, skb, &offset, &len);
+			break;
+		case NVME_TCP_RECV_DDGST:
+			result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
+			break;
+		default:
+			result = -EFAULT;
+		}
+		if (result) {
+			dev_err(queue->ctrl->ctrl.device,
+				"receive failed:  %d\n", result);
+			queue->rd_enabled = false;
+			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+			return result;
+		}
+	}
+
+	return consumed;
+}
+
+static void nvme_tcp_data_ready(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+	read_lock(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (likely(queue && queue->rd_enabled))
+		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_write_space(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (likely(queue && sk_stream_is_writeable(sk))) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	}
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void nvme_tcp_state_change(struct sock *sk)
+{
+	struct nvme_tcp_queue *queue;
+
+	read_lock(&sk->sk_callback_lock);
+	queue = sk->sk_user_data;
+	if (!queue)
+		goto done;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+	case TCP_CLOSE_WAIT:
+	case TCP_LAST_ACK:
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+		/* fallthrough */
+		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
+		break;
+	default:
+		dev_info(queue->ctrl->ctrl.device,
+			"queue %d socket state %d\n",
+			nvme_tcp_queue_id(queue), sk->sk_state);
+	}
+
+	queue->state_change(sk);
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
+{
+	queue->request = NULL;
+}
+
+static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
+{
+	union nvme_result res = {};
+
+	nvme_end_request(blk_mq_rq_from_pdu(req),
+		NVME_SC_DATA_XFER_ERROR, res);
+}
+
+static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+
+	while (true) {
+		struct page *page = nvme_tcp_req_cur_page(req);
+		size_t offset = nvme_tcp_req_cur_offset(req);
+		size_t len = nvme_tcp_req_cur_length(req);
+		bool last = nvme_tcp_pdu_last_send(req, len);
+		int ret, flags = MSG_DONTWAIT;
+
+		if (last && !queue->data_digest)
+			flags |= MSG_EOR;
+		else
+			flags |= MSG_MORE;
+
+		ret = kernel_sendpage(queue->sock, page, offset, len, flags);
+		if (ret <= 0)
+			return ret;
+
+		nvme_tcp_advance_req(req, ret);
+		if (queue->data_digest)
+			nvme_tcp_ddgst_update(queue->snd_hash, page,
+					offset, ret);
+
+		/* fully successful last write*/
+		if (last && ret == len) {
+			if (queue->data_digest) {
+				nvme_tcp_ddgst_final(queue->snd_hash,
+					&req->ddgst);
+				req->state = NVME_TCP_SEND_DDGST;
+				req->offset = 0;
+			} else {
+				nvme_tcp_done_send_req(queue);
+			}
+			return 1;
+		}
+	}
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	bool inline_data = nvme_tcp_has_inline_data(req);
+	int flags = MSG_DONTWAIT | (inline_data ? MSG_MORE : MSG_EOR);
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+	int len = sizeof(*pdu) + hdgst - req->offset;
+	int ret;
+
+	if (queue->hdr_digest && !req->offset)
+		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+			offset_in_page(pdu) + req->offset, len,  flags);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	len -= ret;
+	if (!len) {
+		if (inline_data) {
+			req->state = NVME_TCP_SEND_DATA;
+			if (queue->data_digest)
+				crypto_ahash_init(queue->snd_hash);
+			nvme_tcp_init_iter(req, WRITE);
+		} else {
+			nvme_tcp_done_send_req(queue);
+		}
+		return 1;
+	}
+	req->offset += ret;
+
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	struct nvme_tcp_data_pdu *pdu = req->pdu;
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+	int len = sizeof(*pdu) - req->offset + hdgst;
+	int ret;
+
+	if (queue->hdr_digest && !req->offset)
+		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
+
+	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+			offset_in_page(pdu) + req->offset, len,
+			MSG_DONTWAIT | MSG_MORE);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	len -= ret;
+	if (!len) {
+		req->state = NVME_TCP_SEND_DATA;
+		if (queue->data_digest)
+			crypto_ahash_init(queue->snd_hash);
+		if (!req->data_sent)
+			nvme_tcp_init_iter(req, WRITE);
+		return 1;
+	}
+	req->offset += ret;
+
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
+{
+	struct nvme_tcp_queue *queue = req->queue;
+	int ret;
+	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
+	struct kvec iov = {
+		.iov_base = &req->ddgst + req->offset,
+		.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
+	};
+
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (unlikely(ret <= 0))
+		return ret;
+
+	if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
+		nvme_tcp_done_send_req(queue);
+		return 1;
+	}
+
+	req->offset += ret;
+	return -EAGAIN;
+}
+
+static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
+{
+	struct nvme_tcp_request *req;
+	int ret = 1;
+
+	if (!queue->request) {
+		queue->request = nvme_tcp_fetch_request(queue);
+		if (!queue->request)
+			return 0;
+	}
+	req = queue->request;
+
+	if (req->state == NVME_TCP_SEND_CMD_PDU) {
+		ret = nvme_tcp_try_send_cmd_pdu(req);
+		if (ret <= 0)
+			goto done;
+		if (!nvme_tcp_has_inline_data(req))
+			return ret;
+	}
+
+	if (req->state == NVME_TCP_SEND_H2C_PDU) {
+		ret = nvme_tcp_try_send_data_pdu(req);
+		if (ret <= 0)
+			goto done;
+	}
+
+	if (req->state == NVME_TCP_SEND_DATA) {
+		ret = nvme_tcp_try_send_data(req);
+		if (ret <= 0)
+			goto done;
+	}
+
+	if (req->state == NVME_TCP_SEND_DDGST)
+		ret = nvme_tcp_try_send_ddgst(req);
+done:
+	if (ret == -EAGAIN)
+		ret = 0;
+	return ret;
+}
+
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+{
+	struct sock *sk = queue->sock->sk;
+	read_descriptor_t rd_desc;
+	int consumed;
+
+	rd_desc.arg.data = queue;
+	rd_desc.count = 1;
+	lock_sock(sk);
+	consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+	release_sock(sk);
+	return consumed;
+}
+
+static void nvme_tcp_io_work(struct work_struct *w)
+{
+	struct nvme_tcp_queue *queue =
+		container_of(w, struct nvme_tcp_queue, io_work);
+	unsigned long start = jiffies + msecs_to_jiffies(1);
+
+	do {
+		bool pending = false;
+		int result;
+
+		result = nvme_tcp_try_send(queue);
+		if (result > 0) {
+			pending = true;
+		} else if (unlikely(result < 0)) {
+			dev_err(queue->ctrl->ctrl.device,
+				"failed to send request %d\n", result);
+			if (result != -EPIPE)
+				nvme_tcp_fail_request(queue->request);
+			nvme_tcp_done_send_req(queue);
+			return;
+		}
+
+		result = nvme_tcp_try_recv(queue);
+		if (result > 0)
+			pending = true;
+
+		if (!pending)
+			return;
+
+	} while (time_after(jiffies, start)); /* quota is exhausted */
+
+	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
+static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
+
+	ahash_request_free(queue->rcv_hash);
+	ahash_request_free(queue->snd_hash);
+	crypto_free_ahash(tfm);
+}
+
+static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
+{
+	struct crypto_ahash *tfm;
+
+	tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->snd_hash)
+		goto free_tfm;
+	ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
+
+	queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
+	if (!queue->rcv_hash)
+		goto free_snd_hash;
+	ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
+
+	return 0;
+free_snd_hash:
+	ahash_request_free(queue->snd_hash);
+free_tfm:
+	crypto_free_ahash(tfm);
+	return -ENOMEM;
+}
+
+static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+	struct nvme_tcp_request *async = &ctrl->async_req;
+
+	page_frag_free(async->pdu);
+}
+
+static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
+{
+	struct nvme_tcp_queue *queue = &ctrl->queues[0];
+	struct nvme_tcp_request *async = &ctrl->async_req;
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+	async->pdu = page_frag_alloc(&queue->pf_cache,
+		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+		GFP_KERNEL | __GFP_ZERO);
+	if (!async->pdu)
+		return -ENOMEM;
+
+	async->queue = &ctrl->queues[0];
+	return 0;
+}
+
+static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
+		return;
+
+	if (queue->hdr_digest || queue->data_digest)
+		nvme_tcp_free_crypto(queue);
+
+	sock_release(queue->sock);
+	kfree(queue->pdu);
+}
+
+static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
+{
+	struct nvme_tcp_icreq_pdu *icreq;
+	struct nvme_tcp_icresp_pdu *icresp;
+	struct msghdr msg = {};
+	struct kvec iov;
+	bool ctrl_hdgst, ctrl_ddgst;
+	int ret;
+
+	icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
+	if (!icreq)
+		return -ENOMEM;
+
+	icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
+	if (!icresp) {
+		ret = -ENOMEM;
+		goto free_icreq;
+	}
+
+	icreq->hdr.type = nvme_tcp_icreq;
+	icreq->hdr.hlen = sizeof(*icreq);
+	icreq->hdr.pdo = 0;
+	icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
+	icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
+	icreq->maxr2t = 0; /* single inflight r2t supported */
+	icreq->hpda = 0; /* no alignment constraint */
+	if (queue->hdr_digest)
+		icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
+	if (queue->data_digest)
+		icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
+
+	iov.iov_base = icreq;
+	iov.iov_len = sizeof(*icreq);
+	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	if (ret < 0)
+		goto free_icresp;
+
+	memset(&msg, 0, sizeof(msg));
+	iov.iov_base = icresp;
+	iov.iov_len = sizeof(*icresp);
+	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
+			iov.iov_len, msg.msg_flags);
+	if (ret < 0)
+		goto free_icresp;
+
+	ret = -EINVAL;
+	if (icresp->hdr.type != nvme_tcp_icresp) {
+		pr_err("queue %d: bad type returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->hdr.type);
+		goto free_icresp;
+	}
+
+	if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
+		pr_err("queue %d: bad pdu length returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->hdr.plen);
+		goto free_icresp;
+	}
+
+	if (icresp->pfv != NVME_TCP_PFV_1_0) {
+		pr_err("queue %d: bad pfv returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->pfv);
+		goto free_icresp;
+	}
+
+	ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
+	if ((queue->data_digest && !ctrl_ddgst) ||
+	    (!queue->data_digest && ctrl_ddgst)) {
+		pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
+			nvme_tcp_queue_id(queue),
+			queue->data_digest ? "enabled" : "disabled",
+			ctrl_ddgst ? "enabled" : "disabled");
+		goto free_icresp;
+	}
+
+	ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
+	if ((queue->hdr_digest && !ctrl_hdgst) ||
+	    (!queue->hdr_digest && ctrl_hdgst)) {
+		pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
+			nvme_tcp_queue_id(queue),
+			queue->hdr_digest ? "enabled" : "disabled",
+			ctrl_hdgst ? "enabled" : "disabled");
+		goto free_icresp;
+	}
+
+	if (icresp->cpda != 0) {
+		pr_err("queue %d: unsupported cpda returned %d\n",
+			nvme_tcp_queue_id(queue), icresp->cpda);
+		goto free_icresp;
+	}
+
+	ret = 0;
+free_icresp:
+	kfree(icresp);
+free_icreq:
+	kfree(icreq);
+	return ret;
+}
+
+static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
+		int qid, size_t queue_size)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
+	int ret, opt, rcv_pdu_size;
+
+	queue->ctrl = ctrl;
+	INIT_LIST_HEAD(&queue->send_list);
+	spin_lock_init(&queue->lock);
+	INIT_WORK(&queue->io_work, nvme_tcp_io_work);
+	queue->queue_size = queue_size;
+
+	if (qid > 0)
+		queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
+	else
+		queue->cmnd_capsule_len = sizeof(struct nvme_command) +
+						NVME_TCP_ADMIN_CCSZ;
+
+	ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
+			IPPROTO_TCP, &queue->sock);
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to create socket: %d\n", ret);
+		return ret;
+	}
+
+	/* Single syn retry */
+	opt = 1;
+	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT,
+			(char *)&opt, sizeof(opt));
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to set TCP_SYNCNT sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	/* Set TCP no delay */
+	opt = 1;
+	ret = kernel_setsockopt(queue->sock, IPPROTO_TCP,
+			TCP_NODELAY, (char *)&opt, sizeof(opt));
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to set TCP_NODELAY sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	/*
+	 * Cleanup whatever is sitting in the TCP transmit queue on socket
+	 * close. This is done to prevent stale data from being sent should
+	 * the network connection be restored before TCP times out.
+	 */
+	ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER,
+			(char *)&sol, sizeof(sol));
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to set SO_LINGER sock opt %d\n", ret);
+		goto err_sock;
+	}
+
+	queue->sock->sk->sk_allocation = GFP_ATOMIC;
+	queue->io_cpu = (qid == 0) ? 0 : qid - 1;
+	queue->request = NULL;
+	queue->data_remaining = 0;
+	queue->ddgst_remaining = 0;
+	queue->pdu_remaining = 0;
+	queue->pdu_offset = 0;
+	sk_set_memalloc(queue->sock->sk);
+
+	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) {
+		ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
+			sizeof(ctrl->src_addr));
+		if (ret) {
+			dev_err(ctrl->ctrl.device,
+				"failed to bind queue %d socket %d\n",
+				qid, ret);
+			goto err_sock;
+		}
+	}
+
+	queue->hdr_digest = nctrl->opts->hdr_digest;
+	queue->data_digest = nctrl->opts->data_digest;
+	if (queue->hdr_digest || queue->data_digest) {
+		ret = nvme_tcp_alloc_crypto(queue);
+		if (ret) {
+			dev_err(ctrl->ctrl.device,
+				"failed to allocate queue %d crypto\n", qid);
+			goto err_sock;
+		}
+	}
+
+	rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
+			nvme_tcp_hdgst_len(queue);
+	queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
+	if (!queue->pdu) {
+		ret = -ENOMEM;
+		goto err_crypto;
+	}
+
+	dev_dbg(ctrl->ctrl.device, "connecting queue %d\n",
+			nvme_tcp_queue_id(queue));
+
+	ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
+		sizeof(ctrl->addr), 0);
+	if (ret) {
+		dev_err(ctrl->ctrl.device,
+			"failed to connect socket: %d\n", ret);
+		goto err_rcv_pdu;
+	}
+
+	ret = nvme_tcp_init_connection(queue);
+	if (ret)
+		goto err_init_connect;
+
+	queue->rd_enabled = true;
+	set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
+	nvme_tcp_init_recv_ctx(queue);
+
+	write_lock_bh(&queue->sock->sk->sk_callback_lock);
+	queue->sock->sk->sk_user_data = queue;
+	queue->state_change = queue->sock->sk->sk_state_change;
+	queue->data_ready = queue->sock->sk->sk_data_ready;
+	queue->write_space = queue->sock->sk->sk_write_space;
+	queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
+	queue->sock->sk->sk_state_change = nvme_tcp_state_change;
+	queue->sock->sk->sk_write_space = nvme_tcp_write_space;
+	write_unlock_bh(&queue->sock->sk->sk_callback_lock);
+
+	return 0;
+
+err_init_connect:
+	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+err_rcv_pdu:
+	kfree(queue->pdu);
+err_crypto:
+	if (queue->hdr_digest || queue->data_digest)
+		nvme_tcp_free_crypto(queue);
+err_sock:
+	sock_release(queue->sock);
+	queue->sock = NULL;
+	return ret;
+}
+
+static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
+{
+	struct socket *sock = queue->sock;
+
+	write_lock_bh(&sock->sk->sk_callback_lock);
+	sock->sk->sk_user_data  = NULL;
+	sock->sk->sk_data_ready = queue->data_ready;
+	sock->sk->sk_state_change = queue->state_change;
+	sock->sk->sk_write_space  = queue->write_space;
+	write_unlock_bh(&sock->sk->sk_callback_lock);
+}
+
+static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
+{
+	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
+	nvme_tcp_restore_sock_calls(queue);
+	cancel_work_sync(&queue->io_work);
+}
+
+static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
+
+	if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
+		return;
+
+	__nvme_tcp_stop_queue(queue);
+}
+
+static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	int ret;
+
+	if (idx)
+		ret = nvmf_connect_io_queue(nctrl, idx);
+	else
+		ret = nvmf_connect_admin_queue(nctrl);
+
+	if (!ret) {
+		set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
+	} else {
+		__nvme_tcp_stop_queue(&ctrl->queues[idx]);
+		dev_err(nctrl->device,
+			"failed to connect queue: %d ret=%d\n", idx, ret);
+	}
+	return ret;
+}
+
+static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
+		bool admin)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+	struct blk_mq_tag_set *set;
+	int ret;
+
+	if (admin) {
+		set = &ctrl->admin_tag_set;
+		memset(set, 0, sizeof(*set));
+		set->ops = &nvme_tcp_admin_mq_ops;
+		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
+		set->reserved_tags = 2; /* connect + keep-alive */
+		set->numa_node = NUMA_NO_NODE;
+		set->cmd_size = sizeof(struct nvme_tcp_request);
+		set->driver_data = ctrl;
+		set->nr_hw_queues = 1;
+		set->timeout = ADMIN_TIMEOUT;
+	} else {
+		set = &ctrl->tag_set;
+		memset(set, 0, sizeof(*set));
+		set->ops = &nvme_tcp_mq_ops;
+		set->queue_depth = nctrl->sqsize + 1;
+		set->reserved_tags = 1; /* fabric connect */
+		set->numa_node = NUMA_NO_NODE;
+		set->flags = BLK_MQ_F_SHOULD_MERGE;
+		set->cmd_size = sizeof(struct nvme_tcp_request);
+		set->driver_data = ctrl;
+		set->nr_hw_queues = nctrl->queue_count - 1;
+		set->timeout = NVME_IO_TIMEOUT;
+	}
+
+	ret = blk_mq_alloc_tag_set(set);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return set;
+}
+
+static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
+{
+	if (to_tcp_ctrl(ctrl)->async_req.pdu) {
+		nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
+		to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
+	}
+
+	nvme_tcp_free_queue(ctrl, 0);
+}
+
+static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_tcp_free_queue(ctrl, i);
+}
+
+static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i;
+
+	for (i = 1; i < ctrl->queue_count; i++)
+		nvme_tcp_stop_queue(ctrl, i);
+}
+
+static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i, ret = 0;
+
+	for (i = 1; i < ctrl->queue_count; i++) {
+		ret = nvme_tcp_start_queue(ctrl, i);
+		if (ret)
+			goto out_stop_queues;
+	}
+
+	return 0;
+
+out_stop_queues:
+	for (i--; i >= 1; i--)
+		nvme_tcp_stop_queue(ctrl, i);
+	return ret;
+}
+
+static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
+{
+	int ret;
+
+	ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
+	if (ret)
+		return ret;
+
+	ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
+	if (ret)
+		goto out_free_queue;
+
+	return 0;
+
+out_free_queue:
+	nvme_tcp_free_queue(ctrl, 0);
+	return ret;
+}
+
+static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+	int i, ret;
+
+	for (i = 1; i < ctrl->queue_count; i++) {
+		ret = nvme_tcp_alloc_queue(ctrl, i,
+				ctrl->sqsize + 1);
+		if (ret)
+			goto out_free_queues;
+	}
+
+	return 0;
+
+out_free_queues:
+	for (i--; i >= 1; i--)
+		nvme_tcp_free_queue(ctrl, i);
+
+	return ret;
+}
+
+static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
+{
+	return min(ctrl->queue_count - 1, num_online_cpus());
+}
+
+static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
+{
+	unsigned int nr_io_queues;
+	int ret;
+
+	nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
+	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
+	if (ret)
+		return ret;
+
+	ctrl->queue_count = nr_io_queues + 1;
+	if (ctrl->queue_count < 2)
+		return 0;
+
+	dev_info(ctrl->device,
+		"creating %d I/O queues.\n", nr_io_queues);
+
+	return nvme_tcp_alloc_io_queues(ctrl);
+}
+
+static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
+{
+	nvme_tcp_stop_io_queues(ctrl);
+	if (remove) {
+		if (ctrl->ops->flags & NVME_F_FABRICS)
+			blk_cleanup_queue(ctrl->connect_q);
+		blk_mq_free_tag_set(ctrl->tagset);
+	}
+	nvme_tcp_free_io_queues(ctrl);
+}
+
+static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
+{
+	int ret;
+
+	ret = nvme_alloc_io_queues(ctrl);
+	if (ret)
+		return ret;
+
+	if (new) {
+		ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
+		if (IS_ERR(ctrl->tagset)) {
+			ret = PTR_ERR(ctrl->tagset);
+			goto out_free_io_queues;
+		}
+
+		if (ctrl->ops->flags & NVME_F_FABRICS) {
+			ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
+			if (IS_ERR(ctrl->connect_q)) {
+				ret = PTR_ERR(ctrl->connect_q);
+				goto out_free_tag_set;
+			}
+		}
+	} else {
+		blk_mq_update_nr_hw_queues(ctrl->tagset,
+			ctrl->queue_count - 1);
+	}
+
+	ret = nvme_tcp_start_io_queues(ctrl);
+	if (ret)
+		goto out_cleanup_connect_q;
+
+	return 0;
+
+out_cleanup_connect_q:
+	if (new && (ctrl->ops->flags & NVME_F_FABRICS))
+		blk_cleanup_queue(ctrl->connect_q);
+out_free_tag_set:
+	if (new)
+		blk_mq_free_tag_set(ctrl->tagset);
+out_free_io_queues:
+	nvme_tcp_free_io_queues(ctrl);
+	return ret;
+}
+
+static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
+{
+	nvme_tcp_stop_queue(ctrl, 0);
+	if (remove) {
+		free_opal_dev(ctrl->opal_dev);
+		blk_cleanup_queue(ctrl->admin_q);
+		blk_mq_free_tag_set(ctrl->admin_tagset);
+	}
+	nvme_tcp_free_admin_queue(ctrl);
+}
+
+static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
+{
+	int error;
+
+	error = nvme_tcp_alloc_admin_queue(ctrl);
+	if (error)
+		return error;
+
+	if (new) {
+		ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
+		if (IS_ERR(ctrl->admin_tagset)) {
+			error = PTR_ERR(ctrl->admin_tagset);
+			goto out_free_queue;
+		}
+
+		ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
+		if (IS_ERR(ctrl->admin_q)) {
+			error = PTR_ERR(ctrl->admin_q);
+			goto out_free_tagset;
+		}
+	}
+
+	error = nvme_tcp_start_queue(ctrl, 0);
+	if (error)
+		goto out_cleanup_queue;
+
+	error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+	if (error) {
+		dev_err(ctrl->device,
+			"prop_get NVME_REG_CAP failed\n");
+		goto out_stop_queue;
+	}
+
+	ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize);
+
+	error = nvme_enable_ctrl(ctrl, ctrl->cap);
+	if (error)
+		goto out_stop_queue;
+
+	error = nvme_init_identify(ctrl);
+	if (error)
+		goto out_stop_queue;
+
+	return 0;
+
+out_stop_queue:
+	nvme_tcp_stop_queue(ctrl, 0);
+out_cleanup_queue:
+	if (new)
+		blk_cleanup_queue(ctrl->admin_q);
+out_free_tagset:
+	if (new)
+		blk_mq_free_tag_set(ctrl->admin_tagset);
+out_free_queue:
+	nvme_tcp_free_admin_queue(ctrl);
+	return error;
+}
+
+static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
+		bool remove)
+{
+	blk_mq_quiesce_queue(ctrl->admin_q);
+	nvme_tcp_stop_queue(ctrl, 0);
+	blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl);
+	blk_mq_unquiesce_queue(ctrl->admin_q);
+	nvme_tcp_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
+		bool remove)
+{
+	if (ctrl->queue_count <= 1)
+		return;
+	nvme_stop_queues(ctrl);
+	nvme_tcp_stop_io_queues(ctrl);
+	blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl);
+	if (remove)
+		nvme_start_queues(ctrl);
+	nvme_tcp_destroy_io_queues(ctrl, remove);
+}
+
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+{
+	/* If we are resetting/deleting then do nothing */
+	if (ctrl->state != NVME_CTRL_CONNECTING) {
+		WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
+			ctrl->state == NVME_CTRL_LIVE);
+		return;
+	}
+
+	if (nvmf_should_reconnect(ctrl)) {
+		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
+			ctrl->opts->reconnect_delay);
+		queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
+				ctrl->opts->reconnect_delay * HZ);
+	} else {
+		dev_info(ctrl->device, "Removing controller...\n");
+		nvme_delete_ctrl(ctrl);
+	}
+}
+
+static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
+{
+	struct nvmf_ctrl_options *opts = ctrl->opts;
+	int ret = -EINVAL;
+
+	ret = nvme_tcp_configure_admin_queue(ctrl, new);
+	if (ret)
+		return ret;
+
+	if (ctrl->icdoff) {
+		dev_err(ctrl->device, "icdoff is not supported!\n");
+		goto destroy_admin;
+	}
+
+	if (opts->queue_size > ctrl->sqsize + 1)
+		dev_warn(ctrl->device,
+			"queue_size %zu > ctrl sqsize %u, clamping down\n",
+			opts->queue_size, ctrl->sqsize + 1);
+
+	if (ctrl->sqsize + 1 > ctrl->maxcmd) {
+		dev_warn(ctrl->device,
+			"sqsize %u > ctrl maxcmd %u, clamping down\n",
+			ctrl->sqsize + 1, ctrl->maxcmd);
+		ctrl->sqsize = ctrl->maxcmd - 1;
+	}
+
+	if (ctrl->queue_count > 1) {
+		ret = nvme_tcp_configure_io_queues(ctrl, new);
+		if (ret)
+			goto destroy_admin;
+	}
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		ret = -EINVAL;
+		goto destroy_io;
+	}
+
+	nvme_start_ctrl(ctrl);
+	return 0;
+
+destroy_io:
+	if (ctrl->queue_count > 1)
+		nvme_tcp_destroy_io_queues(ctrl, new);
+destroy_admin:
+	nvme_tcp_stop_queue(ctrl, 0);
+	nvme_tcp_destroy_admin_queue(ctrl, new);
+	return ret;
+}
+
+static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+			struct nvme_tcp_ctrl, connect_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	++ctrl->nr_reconnects;
+
+	if (nvme_tcp_setup_ctrl(ctrl, false))
+		goto requeue;
+
+	dev_info(ctrl->device, "Successfully reconnected (%d attepmpt)\n",
+			ctrl->nr_reconnects);
+
+	ctrl->nr_reconnects = 0;
+
+	return;
+
+requeue:
+	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
+			ctrl->nr_reconnects);
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_error_recovery_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+				struct nvme_tcp_ctrl, err_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	nvme_stop_keep_alive(ctrl);
+	nvme_tcp_teardown_io_queues(ctrl, false);
+	/* unquiesce to fail fast pending requests */
+	nvme_start_queues(ctrl);
+	nvme_tcp_teardown_admin_queue(ctrl, false);
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		return;
+	}
+
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
+{
+	nvme_tcp_teardown_io_queues(ctrl, shutdown);
+	if (shutdown)
+		nvme_shutdown_ctrl(ctrl);
+	else
+		nvme_disable_ctrl(ctrl, ctrl->cap);
+	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
+}
+
+static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
+{
+	nvme_tcp_teardown_ctrl(ctrl, true);
+}
+
+static void nvme_reset_ctrl_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl =
+		container_of(work, struct nvme_ctrl, reset_work);
+
+	nvme_stop_ctrl(ctrl);
+	nvme_tcp_teardown_ctrl(ctrl, false);
+
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
+		/* state change failure is ok if we're in DELETING state */
+		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
+		return;
+	}
+
+	if (nvme_tcp_setup_ctrl(ctrl, false))
+		goto out_fail;
+
+	return;
+
+out_fail:
+	++ctrl->nr_reconnects;
+	nvme_tcp_reconnect_or_remove(ctrl);
+}
+
+static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
+{
+	cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
+	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
+}
+
+static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
+
+	if (list_empty(&ctrl->list))
+		goto free_ctrl;
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_del(&ctrl->list);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	nvmf_free_options(nctrl->opts);
+free_ctrl:
+	kfree(ctrl->queues);
+	kfree(ctrl);
+}
+
+static void nvme_tcp_set_sg_null(struct nvme_command *c)
+{
+	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+	sg->addr = 0;
+	sg->length = 0;
+	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+			NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
+		struct nvme_command *c, u32 data_len)
+{
+	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
+	sg->length = cpu_to_le32(data_len);
+	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
+}
+
+static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
+		u32 data_len)
+{
+	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+
+	sg->addr = 0;
+	sg->length = cpu_to_le32(data_len);
+	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
+			NVME_SGL_FMT_TRANSPORT_A;
+}
+
+static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
+{
+	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
+	struct nvme_tcp_queue *queue = &ctrl->queues[0];
+	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
+	struct nvme_command *cmd = &pdu->cmd;
+	u8 hdgst = nvme_tcp_hdgst_len(queue);
+
+	memset(pdu, 0, sizeof(*pdu));
+	pdu->hdr.type = nvme_tcp_cmd;
+	if (queue->hdr_digest)
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
+
+	cmd->common.opcode = nvme_admin_async_event;
+	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
+	cmd->common.flags |= NVME_CMD_SGL_METABUF;
+	nvme_tcp_set_sg_null(cmd);
+
+	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
+	ctrl->async_req.offset = 0;
+	ctrl->async_req.curr_bio = NULL;
+	ctrl->async_req.data_len = 0;
+
+	nvme_tcp_queue_request(&ctrl->async_req);
+}
+
+static enum blk_eh_timer_return
+nvme_tcp_timeout(struct request *rq, bool reserved)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+
+	dev_dbg(ctrl->ctrl.device,
+		"queue %d: timeout request %#x type %d\n",
+		nvme_tcp_queue_id(req->queue), rq->tag,
+		pdu->hdr.type);
+
+	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
+		union nvme_result res = {};
+
+		nvme_req(rq)->flags |= NVME_REQ_CANCELLED;
+		nvme_end_request(rq, NVME_SC_ABORT_REQ, res);
+		return BLK_EH_DONE;
+	}
+
+	/* queue error recovery */
+	nvme_tcp_error_recovery(&ctrl->ctrl);
+
+	return BLK_EH_RESET_TIMER;
+}
+
+static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
+			struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	struct nvme_command *c = &pdu->cmd;
+
+	c->common.flags |= NVME_CMD_SGL_METABUF;
+
+	if (rq_data_dir(rq) == WRITE && req->data_len &&
+	    req->data_len <= nvme_tcp_inline_data_size(queue))
+		nvme_tcp_set_sg_inline(queue, c, req->data_len);
+	else
+		nvme_tcp_set_sg_host_data(c, req->data_len);
+
+	return 0;
+}
+
+static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
+		struct request *rq)
+{
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+	struct nvme_tcp_queue *queue = req->queue;
+	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
+	blk_status_t ret;
+
+	ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
+	if (ret)
+		return ret;
+
+	req->state = NVME_TCP_SEND_CMD_PDU;
+	req->offset = 0;
+	req->data_sent = 0;
+	req->pdu_len = 0;
+	req->pdu_sent = 0;
+	req->data_len = blk_rq_payload_bytes(rq);
+	req->curr_bio = rq->bio;
+
+	if (rq_data_dir(rq) == WRITE &&
+	    req->data_len <= nvme_tcp_inline_data_size(queue))
+		req->pdu_len = req->data_len;
+	else if (req->curr_bio)
+		nvme_tcp_init_iter(req, READ);
+
+	pdu->hdr.type = nvme_tcp_cmd;
+	pdu->hdr.flags = 0;
+	if (queue->hdr_digest)
+		pdu->hdr.flags |= NVME_TCP_F_HDGST;
+	if (queue->data_digest && req->pdu_len) {
+		pdu->hdr.flags |= NVME_TCP_F_DDGST;
+		ddgst = nvme_tcp_ddgst_len(queue);
+	}
+	pdu->hdr.hlen = sizeof(*pdu);
+	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
+	pdu->hdr.plen =
+		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
+
+	ret = nvme_tcp_map_data(queue, rq);
+	if (unlikely(ret)) {
+		dev_err(queue->ctrl->ctrl.device,
+			"Failed to map data (%d)\n", ret);
+		return ret;
+	}
+
+	return 0;
+}
+
+static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct nvme_ns *ns = hctx->queue->queuedata;
+	struct nvme_tcp_queue *queue = hctx->driver_data;
+	struct request *rq = bd->rq;
+	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
+	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
+	blk_status_t ret;
+
+	if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
+		return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
+
+	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
+	if (unlikely(ret))
+		return ret;
+
+	blk_mq_start_request(rq);
+
+	nvme_tcp_queue_request(req);
+
+	return BLK_STS_OK;
+}
+
+static struct blk_mq_ops nvme_tcp_mq_ops = {
+	.queue_rq	= nvme_tcp_queue_rq,
+	.complete	= nvme_complete_rq,
+	.init_request	= nvme_tcp_init_request,
+	.exit_request	= nvme_tcp_exit_request,
+	.init_hctx	= nvme_tcp_init_hctx,
+	.timeout	= nvme_tcp_timeout,
+};
+
+static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
+	.queue_rq	= nvme_tcp_queue_rq,
+	.complete	= nvme_complete_rq,
+	.init_request	= nvme_tcp_init_request,
+	.exit_request	= nvme_tcp_exit_request,
+	.init_hctx	= nvme_tcp_init_admin_hctx,
+	.timeout	= nvme_tcp_timeout,
+};
+
+static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
+	.name			= "tcp",
+	.module			= THIS_MODULE,
+	.flags			= NVME_F_FABRICS,
+	.reg_read32		= nvmf_reg_read32,
+	.reg_read64		= nvmf_reg_read64,
+	.reg_write32		= nvmf_reg_write32,
+	.free_ctrl		= nvme_tcp_free_ctrl,
+	.submit_async_event	= nvme_tcp_submit_async_event,
+	.delete_ctrl		= nvme_tcp_delete_ctrl,
+	.get_address		= nvmf_get_address,
+	.stop_ctrl		= nvme_tcp_stop_ctrl,
+};
+
+static bool
+nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ctrl *ctrl;
+	bool found = false;
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
+		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
+		if (found)
+			break;
+	}
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	return found;
+}
+
+static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
+		struct nvmf_ctrl_options *opts)
+{
+	struct nvme_tcp_ctrl *ctrl;
+	int ret;
+
+	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	if (!ctrl)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&ctrl->list);
+	ctrl->ctrl.opts = opts;
+	ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+	ctrl->ctrl.sqsize = opts->queue_size - 1;
+	ctrl->ctrl.kato = opts->kato;
+
+	INIT_DELAYED_WORK(&ctrl->connect_work,
+			nvme_tcp_reconnect_ctrl_work);
+	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
+
+	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
+		opts->trsvcid =
+			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
+		if (!opts->trsvcid) {
+			ret = -ENOMEM;
+			goto out_free_ctrl;
+		}
+		opts->mask |= NVMF_OPT_TRSVCID;
+	}
+
+	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+			opts->traddr, opts->trsvcid, &ctrl->addr);
+	if (ret) {
+		pr_err("malformed address passed: %s:%s\n",
+			opts->traddr, opts->trsvcid);
+		goto out_free_ctrl;
+	}
+
+	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
+		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
+			opts->host_traddr, NULL, &ctrl->src_addr);
+		if (ret) {
+			pr_err("malformed src address passed: %s\n",
+			       opts->host_traddr);
+			goto out_free_ctrl;
+		}
+	}
+
+	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
+		ret = -EALREADY;
+		goto out_free_ctrl;
+	}
+
+	ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
+				GFP_KERNEL);
+	if (!ctrl->queues) {
+		ret = -ENOMEM;
+		goto out_free_ctrl;
+	}
+
+	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
+	if (ret)
+		goto out_kfree_queues;
+
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
+		WARN_ON_ONCE(1);
+		ret = -EINTR;
+		goto out_uninit_ctrl;
+	}
+
+	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
+	if (ret)
+		goto out_uninit_ctrl;
+
+	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
+		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
+
+	nvme_get_ctrl(&ctrl->ctrl);
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+
+	return &ctrl->ctrl;
+
+out_uninit_ctrl:
+	nvme_uninit_ctrl(&ctrl->ctrl);
+	nvme_put_ctrl(&ctrl->ctrl);
+	if (ret > 0)
+		ret = -EIO;
+	return ERR_PTR(ret);
+out_kfree_queues:
+	kfree(ctrl->queues);
+out_free_ctrl:
+	kfree(ctrl);
+	return ERR_PTR(ret);
+}
+
+static struct nvmf_transport_ops nvme_tcp_transport = {
+	.name		= "tcp",
+	.module		= THIS_MODULE,
+	.required_opts	= NVMF_OPT_TRADDR,
+	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
+			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST,
+	.create_ctrl	= nvme_tcp_create_ctrl,
+};
+
+static int __init nvme_tcp_init_module(void)
+{
+	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
+			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
+	if (!nvme_tcp_wq)
+		return -ENOMEM;
+
+	nvmf_register_transport(&nvme_tcp_transport);
+	return 0;
+}
+
+static void __exit nvme_tcp_cleanup_module(void)
+{
+	struct nvme_tcp_ctrl *ctrl;
+
+	nvmf_unregister_transport(&nvme_tcp_transport);
+
+	mutex_lock(&nvme_tcp_ctrl_mutex);
+	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
+		nvme_delete_ctrl(&ctrl->ctrl);
+	mutex_unlock(&nvme_tcp_ctrl_mutex);
+	flush_workqueue(nvme_delete_wq);
+
+	destroy_workqueue(nvme_tcp_wq);
+}
+
+module_init(nvme_tcp_init_module);
+module_exit(nvme_tcp_cleanup_module);
+
+MODULE_LICENSE("GPL v2");

From 8eb5d89f483141dd076529bf5f6aa235b425886e Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Tue, 11 Dec 2018 07:24:34 +0800
Subject: [PATCH 294/351] nvme: add __exit annotation

Add __exit annotation to cleanup helper which is only
called once in the module.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 2 +-
 drivers/nvme/host/nvme.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c71e879821ad..3043cedb24e1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3802,7 +3802,7 @@ out:
 	return result;
 }
 
-void nvme_core_exit(void)
+void __exit nvme_core_exit(void)
 {
 	ida_destroy(&nvme_subsystems_ida);
 	class_destroy(nvme_subsys_class);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index e20e737ac10c..c78a9dc7fdcf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -565,6 +565,6 @@ static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
 }
 
 int __init nvme_core_init(void);
-void nvme_core_exit(void);
+void __exit nvme_core_exit(void);
 
 #endif /* _NVME_H */

From cb5b7262b011cfb793519bf97e54dff5282da23c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 12 Dec 2018 09:18:11 -0700
Subject: [PATCH 295/351] nvme: provide fallback for discard alloc failure

When boxes are run near (or to) OOM, we have a problem with the discard
page allocation in nvme. If we fail allocating the special page, we
return busy, and it'll get retried. But since ordering is honored for
dispatch requests, we can keep retrying this same IO and failing. Behind
that IO could be requests that want to free memory, but they never get
the chance.

Allocate a fixed discard page per controller for a safe fallback, and use
that if the initial allocation fails.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c | 41 ++++++++++++++++++++++++++++++++++------
 drivers/nvme/host/nvme.h |  3 +++
 2 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3043cedb24e1..168f2c1eaf60 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -564,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	struct nvme_dsm_range *range;
 	struct bio *bio;
 
-	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
-	if (!range)
-		return BLK_STS_RESOURCE;
+	range = kmalloc_array(segments, sizeof(*range),
+				GFP_ATOMIC | __GFP_NOWARN);
+	if (!range) {
+		/*
+		 * If we fail allocation our range, fallback to the controller
+		 * discard page. If that's also busy, it's safe to return
+		 * busy, as we know we can make progress once that's freed.
+		 */
+		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+			return BLK_STS_RESOURCE;
+
+		range = page_address(ns->ctrl->discard_page);
+	}
 
 	__rq_for_each_bio(bio, req) {
 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -581,7 +591,10 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	}
 
 	if (WARN_ON_ONCE(n != segments)) {
-		kfree(range);
+		if (virt_to_page(range) == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(range);
 		return BLK_STS_IOERR;
 	}
 
@@ -664,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
 				blk_rq_bytes(req) >> ns->lba_shift);
 	}
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		kfree(page_address(req->special_vec.bv_page) +
-		      req->special_vec.bv_offset);
+		struct nvme_ns *ns = req->rq_disk->private_data;
+		struct page *page = req->special_vec.bv_page;
+
+		if (page == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(page_address(page) + req->special_vec.bv_offset);
 	}
 }
 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -3578,6 +3596,7 @@ static void nvme_free_ctrl(struct device *dev)
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 	kfree(ctrl->effects);
 	nvme_mpath_uninit(ctrl);
+	kfree(ctrl->discard_page);
 
 	if (subsys) {
 		mutex_lock(&subsys->lock);
@@ -3618,6 +3637,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 
+	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+			PAGE_SIZE);
+	ctrl->discard_page = alloc_page(GFP_KERNEL);
+	if (!ctrl->discard_page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0)
 		goto out;
@@ -3655,6 +3682,8 @@ out_free_name:
 out_release_instance:
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 out:
+	if (ctrl->discard_page)
+		__free_page(ctrl->discard_page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index c78a9dc7fdcf..39b52f4d9b24 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -241,6 +241,9 @@ struct nvme_ctrl {
 	u16 maxcmd;
 	int nr_reconnects;
 	struct nvmf_ctrl_options *opts;
+
+	struct page *discard_page;
+	unsigned long discard_page_busy;
 };
 
 struct nvme_subsystem {

From 16d3a280d4d73eae786ebf2576cd597a031bc9df Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Wed, 12 Dec 2018 23:01:54 -0800
Subject: [PATCH 296/351] nvmet: remove unused variable

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/io-cmd-bdev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index c1cb2ed5531c..e0831230ca27 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -61,7 +61,6 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 	struct bio *bio;
 	struct scatterlist *sg;
 	sector_t sector;
-	blk_qc_t cookie;
 	int op, op_flags = 0, i;
 
 	if (!req->sg_cnt) {
@@ -114,7 +113,7 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 		sg_cnt--;
 	}
 
-	cookie = submit_bio(bio);
+	submit_bio(bio);
 }
 
 static void nvmet_bdev_execute_flush(struct nvmet_req *req)

From b7c8f3663d0e0773aca3324c26bce3ca8343ec14 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:37 -0800
Subject: [PATCH 297/351] nvme: remove nvme_common command cdw10 array

This is a preparation patch which removes the nvme common command cdw10
array and replace with individual fields. This is needed for the nvmet
error log page implementation make is error log page entry offset
assignment easier.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/core.c        | 18 +++++++++---------
 drivers/nvme/host/lightnvm.c    |  6 +++---
 drivers/nvme/host/trace.h       |  4 ++--
 drivers/nvme/target/admin-cmd.c | 12 ++++++------
 drivers/nvme/target/discovery.c |  4 ++--
 drivers/nvme/target/nvmet.h     |  2 +-
 include/linux/nvme.h            |  7 ++++++-
 7 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 168f2c1eaf60..4d8ee7186268 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1283,12 +1283,12 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.common.nsid = cpu_to_le32(cmd.nsid);
 	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
 	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
-	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
-	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
-	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
-	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
-	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
-	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+	c.common.cdw10 = cpu_to_le32(cmd.cdw10);
+	c.common.cdw11 = cpu_to_le32(cmd.cdw11);
+	c.common.cdw12 = cpu_to_le32(cmd.cdw12);
+	c.common.cdw13 = cpu_to_le32(cmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(cmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(cmd.cdw15);
 
 	if (cmd.timeout_ms)
 		timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -1649,7 +1649,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 	memset(&c, 0, sizeof(c));
 	c.common.opcode = op;
 	c.common.nsid = cpu_to_le32(ns->head->ns_id);
-	c.common.cdw10[0] = cpu_to_le32(cdw10);
+	c.common.cdw10 = cpu_to_le32(cdw10);
 
 	ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16);
 	nvme_put_ns_from_disk(head, srcu_idx);
@@ -1723,8 +1723,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 	else
 		cmd.common.opcode = nvme_admin_security_recv;
 	cmd.common.nsid = 0;
-	cmd.common.cdw10[0] = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
-	cmd.common.cdw10[1] = cpu_to_le32(len);
+	cmd.common.cdw10 = cpu_to_le32(((u32)secp) << 24 | ((u32)spsp) << 8);
+	cmd.common.cdw11 = cpu_to_le32(len);
 
 	return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
 				      ADMIN_TIMEOUT, NVME_QID_ANY, 1, 0);
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index f145fc0220d6..b759c25c89c8 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -937,9 +937,9 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin,
 	/* cdw11-12 */
 	c.ph_rw.length = cpu_to_le16(vcmd.nppas);
 	c.ph_rw.control  = cpu_to_le16(vcmd.control);
-	c.common.cdw10[3] = cpu_to_le32(vcmd.cdw13);
-	c.common.cdw10[4] = cpu_to_le32(vcmd.cdw14);
-	c.common.cdw10[5] = cpu_to_le32(vcmd.cdw15);
+	c.common.cdw13 = cpu_to_le32(vcmd.cdw13);
+	c.common.cdw14 = cpu_to_le32(vcmd.cdw14);
+	c.common.cdw15 = cpu_to_le32(vcmd.cdw15);
 
 	if (vcmd.timeout_ms)
 		timeout = msecs_to_jiffies(vcmd.timeout_ms);
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
index 196d5bd56718..1978deb6fcc7 100644
--- a/drivers/nvme/host/trace.h
+++ b/drivers/nvme/host/trace.h
@@ -115,8 +115,8 @@ TRACE_EVENT(nvme_setup_cmd,
 		__entry->nsid = le32_to_cpu(cmd->common.nsid);
 		__entry->metadata = le64_to_cpu(cmd->common.metadata);
 		__assign_disk_name(__entry->disk, req->rq_disk);
-		memcpy(__entry->cdw10, cmd->common.cdw10,
-		       sizeof(__entry->cdw10));
+		memcpy(__entry->cdw10, &cmd->common.cdw10,
+			6 * sizeof(__entry->cdw10));
 	    ),
 	    TP_printk("nvme%d: %sqid=%d, cmdid=%u, nsid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
 		      __entry->ctrl_id, __print_disk_name(__entry->disk),
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 753515fc8028..721b041a6b3b 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -557,7 +557,7 @@ static u16 nvmet_write_protect_flush_sync(struct nvmet_req *req)
 
 static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 {
-	u32 write_protect = le32_to_cpu(req->cmd->common.cdw10[1]);
+	u32 write_protect = le32_to_cpu(req->cmd->common.cdw11);
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
 	u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
 
@@ -589,7 +589,7 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 
 u16 nvmet_set_feat_kato(struct nvmet_req *req)
 {
-	u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
+	u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
 
 	req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000);
 
@@ -600,7 +600,7 @@ u16 nvmet_set_feat_kato(struct nvmet_req *req)
 
 u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
 {
-	u32 val32 = le32_to_cpu(req->cmd->common.cdw10[1]);
+	u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
 
 	if (val32 & ~mask)
 		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
@@ -614,7 +614,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
 static void nvmet_execute_set_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
-	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
 	u16 status = 0;
 
 	switch (cdw10 & 0xff) {
@@ -675,7 +675,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req)
 static void nvmet_execute_get_features(struct nvmet_req *req)
 {
 	struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
-	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
 	u16 status = 0;
 
 	switch (cdw10 & 0xff) {
@@ -715,7 +715,7 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 		break;
 	case NVME_FEAT_HOST_ID:
 		/* need 128-bit host identifier flag */
-		if (!(req->cmd->common.cdw10[1] & cpu_to_le32(1 << 0))) {
+		if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) {
 			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 			break;
 		}
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 4d8757ae8210..e1bb254671de 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -247,7 +247,7 @@ out:
 
 static void nvmet_execute_disc_set_features(struct nvmet_req *req)
 {
-	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
 	u16 stat;
 
 	switch (cdw10 & 0xff) {
@@ -268,7 +268,7 @@ static void nvmet_execute_disc_set_features(struct nvmet_req *req)
 
 static void nvmet_execute_disc_get_features(struct nvmet_req *req)
 {
-	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]);
+	u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
 	u16 stat = 0;
 
 	switch (cdw10 & 0xff) {
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 89df51ee5bdf..dafee1af4829 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -349,7 +349,7 @@ struct nvmet_async_event {
 
 static inline void nvmet_clear_aen_bit(struct nvmet_req *req, u32 bn)
 {
-	int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;
+	int rae = le32_to_cpu(req->cmd->common.cdw10) & 1 << 15;
 
 	if (!rae)
 		clear_bit(bn, &req->sq->ctrl->aen_masked);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 4d7907e3771e..b94fe8fadc4f 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -662,7 +662,12 @@ struct nvme_common_command {
 	__le32			cdw2[2];
 	__le64			metadata;
 	union nvme_data_ptr	dptr;
-	__le32			cdw10[6];
+	__le32			cdw10;
+	__le32			cdw11;
+	__le32			cdw12;
+	__le32			cdw13;
+	__le32			cdw14;
+	__le32			cdw15;
 };
 
 struct nvme_rw_command {

From b34de7cee0a65f2557bb05447fbe2cc7a9c46750 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:38 -0800
Subject: [PATCH 298/351] nvme: add error log page slot definition

This patch adds the NVMe error slot definition from the spec.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/nvme.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index b94fe8fadc4f..bbcc83886899 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1168,6 +1168,20 @@ struct nvme_command {
 	};
 };
 
+struct nvme_error_slot {
+	__le64		error_count;
+	__le16		sqid;
+	__le16		cmdid;
+	__le16		status_field;
+	__le16		param_error_location;
+	__le64		lba;
+	__le32		nsid;
+	__u8		vs;
+	__u8		resv[3];
+	__le64		cs;
+	__u8		resv2[24];
+};
+
 static inline bool nvme_is_write(struct nvme_command *cmd)
 {
 	/*

From e4a976254ec5ebdcdb3e1e1b40e795b3db6b6dab Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:39 -0800
Subject: [PATCH 299/351] nvmet: add error-log definitions

This patch adds necessary fields in the target data structures to
support error log page. For a target controller, we add a new error log
field to maintain the error log, at any given point we maintain error
entries equal to NVMET_ERROR_LOG_SLOTS for each controller. In the
following patch, we also update the error log page entry in the I/O
completion path so we introduce a spinlock for synchronization of the
log.

For nvmet_req, we add a new field error_loc to hold the location of
the error in the command when the actual error occurs for each request
and a starting LBA if applicable.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c  | 5 +++++
 drivers/nvme/target/nvmet.h | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index e468100b9211..32dca1b50c21 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -769,6 +769,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 	req->rsp->status = 0;
 	req->rsp->sq_head = 0;
 	req->ns = NULL;
+	req->error_loc = -1;
+	req->error_slba = 0;
 
 	/* no support for fused commands yet */
 	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@@ -1174,6 +1176,9 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
 	/* keep-alive timeout in seconds */
 	ctrl->kato = DIV_ROUND_UP(kato, 1000);
 
+	ctrl->err_counter = 0;
+	spin_lock_init(&ctrl->error_lock);
+
 	nvmet_start_keep_alive_timer(ctrl);
 
 	mutex_lock(&subsys->lock);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index dafee1af4829..61b3cc415905 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -202,6 +202,10 @@ struct nvmet_ctrl {
 
 	struct device		*p2p_client;
 	struct radix_tree_root	p2p_ns_map;
+
+	spinlock_t		error_lock;
+	u64			err_counter;
+	struct nvme_error_slot	slots[NVMET_ERROR_LOG_SLOTS];
 };
 
 struct nvmet_subsys {
@@ -317,6 +321,8 @@ struct nvmet_req {
 
 	struct pci_dev		*p2p_dev;
 	struct device		*p2p_client;
+	u16			error_loc;
+	u64			error_slba;
 };
 
 extern struct workqueue_struct *buffered_io_wq;

From 76574f37bf4caa7150ea1559cd98e3017b9752d2 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:40 -0800
Subject: [PATCH 300/351] nvmet: add interface to update error-log page

This patch adds nvmet_req based interface to the nvmet-core so that
we can update the error log page. We update error log page in
the request completion path when status is not set to NVME_SC_SUCCESS.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c  | 32 +++++++++++++++++++++++++++++++-
 drivers/nvme/target/nvmet.h |  5 -----
 2 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 32dca1b50c21..c7753ecdaaa5 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -611,14 +611,44 @@ static void nvmet_update_sq_head(struct nvmet_req *req)
 	req->rsp->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
 }
 
+static void nvmet_set_error(struct nvmet_req *req, u16 status)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	struct nvme_error_slot *new_error_slot;
+	unsigned long flags;
+
+	req->rsp->status = cpu_to_le16(status << 1);
+
+	if (!ctrl || req->error_loc == -1)
+		return;
+
+	spin_lock_irqsave(&ctrl->error_lock, flags);
+	ctrl->err_counter++;
+	new_error_slot =
+		&ctrl->slots[ctrl->err_counter % NVMET_ERROR_LOG_SLOTS];
+
+	new_error_slot->error_count = cpu_to_le64(ctrl->err_counter);
+	new_error_slot->sqid = cpu_to_le16(req->sq->qid);
+	new_error_slot->cmdid = cpu_to_le16(req->cmd->common.command_id);
+	new_error_slot->status_field = cpu_to_le16(status << 1);
+	new_error_slot->param_error_location = cpu_to_le16(req->error_loc);
+	new_error_slot->lba = cpu_to_le64(req->error_slba);
+	new_error_slot->nsid = req->cmd->common.nsid;
+	spin_unlock_irqrestore(&ctrl->error_lock, flags);
+
+	/* set the more bit for this request */
+	req->rsp->status |= cpu_to_le16(1 << 14);
+}
+
 static void __nvmet_req_complete(struct nvmet_req *req, u16 status)
 {
 	if (!req->sq->sqhd_disabled)
 		nvmet_update_sq_head(req);
 	req->rsp->sq_id = cpu_to_le16(req->sq->qid);
 	req->rsp->command_id = req->cmd->common.command_id;
+
 	if (unlikely(status))
-		nvmet_set_status(req, status);
+		nvmet_set_error(req, status);
 	if (req->ns)
 		nvmet_put_namespace(req->ns);
 	req->ops->queue_response(req);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 61b3cc415905..a2a7fd69b66a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -327,11 +327,6 @@ struct nvmet_req {
 
 extern struct workqueue_struct *buffered_io_wq;
 
-static inline void nvmet_set_status(struct nvmet_req *req, u16 status)
-{
-	req->rsp->status = cpu_to_le16(status << 1);
-}
-
 static inline void nvmet_set_result(struct nvmet_req *req, u32 result)
 {
 	req->rsp->result.u32 = cpu_to_le32(result);

From e81446afc7f5ee45fd90b1b5a22ffa247dca5b15 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:41 -0800
Subject: [PATCH 301/351] nvmet: add error log support in the core

This patch adds the support to maintain error log page for the
nvmet-core.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index c7753ecdaaa5..0b969977e361 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -51,22 +51,28 @@ static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf,
 		size_t len)
 {
-	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
+	if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
+		req->error_loc = offsetof(struct nvme_common_command, dptr);
 		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+	}
 	return 0;
 }
 
 u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len)
 {
-	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len)
+	if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) {
+		req->error_loc = offsetof(struct nvme_common_command, dptr);
 		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+	}
 	return 0;
 }
 
 u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
 {
-	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len)
+	if (sg_zero_buffer(req->sg, req->sg_cnt, len, off) != len) {
+		req->error_loc = offsetof(struct nvme_common_command, dptr);
 		return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR;
+	}
 	return 0;
 }
 
@@ -769,14 +775,20 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
 		return ret;
 
 	req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
-	if (unlikely(!req->ns))
+	if (unlikely(!req->ns)) {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
+	}
 	ret = nvmet_check_ana_state(req->port, req->ns);
-	if (unlikely(ret))
+	if (unlikely(ret)) {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
 		return ret;
+	}
 	ret = nvmet_io_cmd_check_access(req);
-	if (unlikely(ret))
+	if (unlikely(ret)) {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
 		return ret;
+	}
 
 	if (req->ns->file)
 		return nvmet_file_parse_io_cmd(req);
@@ -804,6 +816,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 
 	/* no support for fused commands yet */
 	if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
+		req->error_loc = offsetof(struct nvme_common_command, flags);
 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		goto fail;
 	}
@@ -814,6 +827,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 	 * byte aligned.
 	 */
 	if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF)) {
+		req->error_loc = offsetof(struct nvme_common_command, flags);
 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		goto fail;
 	}
@@ -859,9 +873,10 @@ EXPORT_SYMBOL_GPL(nvmet_req_uninit);
 
 void nvmet_req_execute(struct nvmet_req *req)
 {
-	if (unlikely(req->data_len != req->transfer_len))
+	if (unlikely(req->data_len != req->transfer_len)) {
+		req->error_loc = offsetof(struct nvme_common_command, dptr);
 		nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR);
-	else
+	} else
 		req->execute(req);
 }
 EXPORT_SYMBOL_GPL(nvmet_req_execute);

From 84faf42b8aff3be95d96249da4152c77d81e1469 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:44 -0800
Subject: [PATCH 302/351] nvmet: add error log support for fabrics-cmd

This patch adds the support to maintain error log page for the fabrics
prop get, prop set, and admin connect commands. Here we also update the
discovery.c and add update set/get features and parse functions to
support error log page.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/discovery.c   | 10 +++++++
 drivers/nvme/target/fabrics-cmd.c | 48 ++++++++++++++++++++++---------
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index e1bb254671de..d2cb71a0b419 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -259,6 +259,8 @@ static void nvmet_execute_disc_set_features(struct nvmet_req *req)
 						  NVMET_DISC_AEN_CFG_OPTIONAL);
 		break;
 	default:
+		req->error_loc =
+			offsetof(struct nvme_common_command, cdw10);
 		stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		break;
 	}
@@ -279,6 +281,8 @@ static void nvmet_execute_disc_get_features(struct nvmet_req *req)
 		nvmet_get_feat_async_event(req);
 		break;
 	default:
+		req->error_loc =
+			offsetof(struct nvme_common_command, cdw10);
 		stat = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		break;
 	}
@@ -293,6 +297,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 	if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) {
 		pr_err("got cmd %d while not ready\n",
 		       cmd->common.opcode);
+		req->error_loc =
+			offsetof(struct nvme_common_command, opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 
@@ -323,6 +329,8 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 		default:
 			pr_err("unsupported get_log_page lid %d\n",
 			       cmd->get_log_page.lid);
+			req->error_loc =
+				offsetof(struct nvme_get_log_page_command, lid);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 		}
 	case nvme_admin_identify:
@@ -335,10 +343,12 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req)
 		default:
 			pr_err("unsupported identify cns %d\n",
 			       cmd->identify.cns);
+			req->error_loc = offsetof(struct nvme_identify, cns);
 			return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 		}
 	default:
 		pr_err("unhandled cmd %d\n", cmd->common.opcode);
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index ee7d84621d65..6cf1fd9eb32e 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -17,23 +17,26 @@
 
 static void nvmet_execute_prop_set(struct nvmet_req *req)
 {
+	u64 val = le64_to_cpu(req->cmd->prop_set.value);
 	u16 status = 0;
 
-	if (!(req->cmd->prop_set.attrib & 1)) {
-		u64 val = le64_to_cpu(req->cmd->prop_set.value);
-
-		switch (le32_to_cpu(req->cmd->prop_set.offset)) {
-		case NVME_REG_CC:
-			nvmet_update_cc(req->sq->ctrl, val);
-			break;
-		default:
-			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
-			break;
-		}
-	} else {
+	if (req->cmd->prop_set.attrib & 1) {
+		req->error_loc =
+			offsetof(struct nvmf_property_set_command, attrib);
 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+		goto out;
 	}
 
+	switch (le32_to_cpu(req->cmd->prop_set.offset)) {
+	case NVME_REG_CC:
+		nvmet_update_cc(req->sq->ctrl, val);
+		break;
+	default:
+		req->error_loc =
+			offsetof(struct nvmf_property_set_command, offset);
+		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
+out:
 	nvmet_req_complete(req, status);
 }
 
@@ -69,6 +72,14 @@ static void nvmet_execute_prop_get(struct nvmet_req *req)
 		}
 	}
 
+	if (status && req->cmd->prop_get.attrib & 1) {
+		req->error_loc =
+			offsetof(struct nvmf_property_get_command, offset);
+	} else {
+		req->error_loc =
+			offsetof(struct nvmf_property_get_command, attrib);
+	}
+
 	req->rsp->result.u64 = cpu_to_le64(val);
 	nvmet_req_complete(req, status);
 }
@@ -89,6 +100,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req)
 	default:
 		pr_err("received unknown capsule type 0x%x\n",
 			cmd->fabrics.fctype);
+		req->error_loc = offsetof(struct nvmf_common_command, fctype);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 
@@ -105,10 +117,12 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
 	old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
 	if (old) {
 		pr_warn("queue already connected!\n");
+		req->error_loc = offsetof(struct nvmf_connect_command, opcode);
 		return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR;
 	}
 	if (!sqsize) {
 		pr_warn("queue size zero!\n");
+		req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
 		return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
 	}
 
@@ -157,6 +171,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 	if (c->recfmt != 0) {
 		pr_warn("invalid connect version (%d).\n",
 			le16_to_cpu(c->recfmt));
+		req->error_loc = offsetof(struct nvmf_connect_command, recfmt);
 		status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR;
 		goto out;
 	}
@@ -171,8 +186,13 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
 
 	status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req,
 				  le32_to_cpu(c->kato), &ctrl);
-	if (status)
+	if (status) {
+		if (status == (NVME_SC_INVALID_FIELD | NVME_SC_DNR))
+			req->error_loc =
+				offsetof(struct nvme_common_command, opcode);
 		goto out;
+	}
+
 	uuid_copy(&ctrl->hostid, &d->hostid);
 
 	status = nvmet_install_queue(ctrl, req);
@@ -259,11 +279,13 @@ u16 nvmet_parse_connect_cmd(struct nvmet_req *req)
 	if (cmd->common.opcode != nvme_fabrics_command) {
 		pr_err("invalid command 0x%x on unconnected queue.\n",
 			cmd->fabrics.opcode);
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 	if (cmd->fabrics.fctype != nvme_fabrics_type_connect) {
 		pr_err("invalid capsule type 0x%x on unconnected queue.\n",
 			cmd->fabrics.fctype);
+		req->error_loc = offsetof(struct nvmf_common_command, fctype);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 

From 762a11dfee10783a2fe4c467a68bac601e5acf1c Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:45 -0800
Subject: [PATCH 303/351] nvmet: add error log support for rdma backend

This patch adds the support to maintain the error log page for rdma
transport, we mainly focus here on the NVME_INVALID_FIELD errors.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/rdma.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index fb84caddd94b..a8d23eb80192 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -630,8 +630,11 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
 	u64 off = le64_to_cpu(sgl->addr);
 	u32 len = le32_to_cpu(sgl->length);
 
-	if (!nvme_is_write(rsp->req.cmd))
+	if (!nvme_is_write(rsp->req.cmd)) {
+		rsp->req.error_loc =
+			offsetof(struct nvme_common_command, opcode);
 		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
 
 	if (off + len > rsp->queue->dev->inline_data_size) {
 		pr_err("invalid inline data offset!\n");
@@ -696,6 +699,8 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
 			return nvmet_rdma_map_sgl_inline(rsp);
 		default:
 			pr_err("invalid SGL subtype: %#x\n", sgl->type);
+			rsp->req.error_loc =
+				offsetof(struct nvme_common_command, dptr);
 			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		}
 	case NVME_KEY_SGL_FMT_DATA_DESC:
@@ -706,10 +711,13 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
 			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
 		default:
 			pr_err("invalid SGL subtype: %#x\n", sgl->type);
+			rsp->req.error_loc =
+				offsetof(struct nvme_common_command, dptr);
 			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		}
 	default:
 		pr_err("invalid SGL type: %#x\n", sgl->type);
+		rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
 		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
 	}
 }

From 2da6e00580f5bc13ed0ba0acaa9d7ce0df226e7e Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:46 -0800
Subject: [PATCH 304/351] nvmet: add error log support for admin-cmd

This patch adds the support to maintain the error log page for admin
commands.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 721b041a6b3b..fa62db7a5e9e 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -47,6 +47,7 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 	if (!ns) {
 		pr_err("Could not find namespace id : %d\n",
 				le32_to_cpu(req->cmd->get_log_page.nsid));
+		req->error_loc = offsetof(struct nvme_rw_command, nsid);
 		return NVME_SC_INVALID_NS;
 	}
 
@@ -380,6 +381,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
 	u16 status = 0;
 
 	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
 		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
 		goto out;
 	}
@@ -500,6 +502,7 @@ static void nvmet_execute_identify_desclist(struct nvmet_req *req)
 
 	ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
 	if (!ns) {
+		req->error_loc = offsetof(struct nvme_identify, nsid);
 		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
 		goto out;
 	}
@@ -562,8 +565,10 @@ static u16 nvmet_set_feat_write_protect(struct nvmet_req *req)
 	u16 status = NVME_SC_FEATURE_NOT_CHANGEABLE;
 
 	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->rw.nsid);
-	if (unlikely(!req->ns))
+	if (unlikely(!req->ns)) {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
 		return status;
+	}
 
 	mutex_lock(&subsys->lock);
 	switch (write_protect) {
@@ -602,8 +607,10 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
 {
 	u32 val32 = le32_to_cpu(req->cmd->common.cdw11);
 
-	if (val32 & ~mask)
+	if (val32 & ~mask) {
+		req->error_loc = offsetof(struct nvme_common_command, cdw11);
 		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
+	}
 
 	WRITE_ONCE(req->sq->ctrl->aen_enabled, val32);
 	nvmet_set_result(req, val32);
@@ -635,6 +642,7 @@ static void nvmet_execute_set_features(struct nvmet_req *req)
 		status = nvmet_set_feat_write_protect(req);
 		break;
 	default:
+		req->error_loc = offsetof(struct nvme_common_command, cdw10);
 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		break;
 	}
@@ -648,9 +656,10 @@ static u16 nvmet_get_feat_write_protect(struct nvmet_req *req)
 	u32 result;
 
 	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->common.nsid);
-	if (!req->ns)
+	if (!req->ns)  {
+		req->error_loc = offsetof(struct nvme_common_command, nsid);
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
-
+	}
 	mutex_lock(&subsys->lock);
 	if (req->ns->readonly == true)
 		result = NVME_NS_WRITE_PROTECT;
@@ -716,6 +725,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 	case NVME_FEAT_HOST_ID:
 		/* need 128-bit host identifier flag */
 		if (!(req->cmd->common.cdw11 & cpu_to_le32(1 << 0))) {
+			req->error_loc =
+				offsetof(struct nvme_common_command, cdw11);
 			status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 			break;
 		}
@@ -727,6 +738,8 @@ static void nvmet_execute_get_features(struct nvmet_req *req)
 		status = nvmet_get_feat_write_protect(req);
 		break;
 	default:
+		req->error_loc =
+			offsetof(struct nvme_common_command, cdw10);
 		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 		break;
 	}
@@ -848,5 +861,6 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 
 	pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
 	       req->sq->qid);
+	req->error_loc = offsetof(struct nvme_common_command, opcode);
 	return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 }

From 3b031d15995f3e4c43e38159072f07efe3fa35d9 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:42 -0800
Subject: [PATCH 305/351] nvmet: add error log support for bdev backend

This patch adds the support for the block device backend to populate the
error log entries. Here we map the blk_status_t to the NVMe status.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/io-cmd-bdev.c | 84 ++++++++++++++++++++++++++-----
 1 file changed, 72 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index e0831230ca27..b6d030d3259f 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -44,13 +44,69 @@ void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
 	}
 }
 
+static u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
+{
+	u16 status = NVME_SC_SUCCESS;
+
+	if (likely(blk_sts == BLK_STS_OK))
+		return status;
+	/*
+	 * Right now there exists M : 1 mapping between block layer error
+	 * to the NVMe status code (see nvme_error_status()). For consistency,
+	 * when we reverse map we use most appropriate NVMe Status code from
+	 * the group of the NVMe staus codes used in the nvme_error_status().
+	 */
+	switch (blk_sts) {
+	case BLK_STS_NOSPC:
+		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
+		req->error_loc = offsetof(struct nvme_rw_command, length);
+		break;
+	case BLK_STS_TARGET:
+		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+		req->error_loc = offsetof(struct nvme_rw_command, slba);
+		break;
+	case BLK_STS_NOTSUPP:
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+		switch (req->cmd->common.opcode) {
+		case nvme_cmd_dsm:
+		case nvme_cmd_write_zeroes:
+			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
+			break;
+		default:
+			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		}
+		break;
+	case BLK_STS_MEDIUM:
+		status = NVME_SC_ACCESS_DENIED;
+		req->error_loc = offsetof(struct nvme_rw_command, nsid);
+		break;
+	case BLK_STS_IOERR:
+		/* fallthru */
+	default:
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+	}
+
+	switch (req->cmd->common.opcode) {
+	case nvme_cmd_read:
+	case nvme_cmd_write:
+		req->error_slba = le64_to_cpu(req->cmd->rw.slba);
+		break;
+	case nvme_cmd_write_zeroes:
+		req->error_slba =
+			le64_to_cpu(req->cmd->write_zeroes.slba);
+		break;
+	default:
+		req->error_slba = 0;
+	}
+	return status;
+}
+
 static void nvmet_bio_done(struct bio *bio)
 {
 	struct nvmet_req *req = bio->bi_private;
 
-	nvmet_req_complete(req,
-		bio->bi_status ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
-
+	nvmet_req_complete(req, blk_to_nvme_status(req, bio->bi_status));
 	if (bio != &req->b.inline_bio)
 		bio_put(bio);
 }
@@ -136,18 +192,21 @@ u16 nvmet_bdev_flush(struct nvmet_req *req)
 	return 0;
 }
 
-static u16 nvmet_bdev_discard_range(struct nvmet_ns *ns,
+static u16 nvmet_bdev_discard_range(struct nvmet_req *req,
 		struct nvme_dsm_range *range, struct bio **bio)
 {
+	struct nvmet_ns *ns = req->ns;
 	int ret;
 
 	ret = __blkdev_issue_discard(ns->bdev,
 			le64_to_cpu(range->slba) << (ns->blksize_shift - 9),
 			le32_to_cpu(range->nlb) << (ns->blksize_shift - 9),
 			GFP_KERNEL, 0, bio);
-	if (ret && ret != -EOPNOTSUPP)
-		return NVME_SC_INTERNAL | NVME_SC_DNR;
-	return 0;
+
+	if (ret)
+		req->error_slba = le64_to_cpu(range->slba);
+
+	return blk_to_nvme_status(req, errno_to_blk_status(ret));
 }
 
 static void nvmet_bdev_execute_discard(struct nvmet_req *req)
@@ -163,7 +222,7 @@ static void nvmet_bdev_execute_discard(struct nvmet_req *req)
 		if (status)
 			break;
 
-		status = nvmet_bdev_discard_range(req->ns, &range, &bio);
+		status = nvmet_bdev_discard_range(req, &range, &bio);
 		if (status)
 			break;
 	}
@@ -204,16 +263,16 @@ static void nvmet_bdev_execute_write_zeroes(struct nvmet_req *req)
 	u16 status = NVME_SC_SUCCESS;
 	sector_t sector;
 	sector_t nr_sector;
+	int ret;
 
 	sector = le64_to_cpu(write_zeroes->slba) <<
 		(req->ns->blksize_shift - 9);
 	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
 		(req->ns->blksize_shift - 9));
 
-	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
-				GFP_KERNEL, &bio, 0))
-		status = NVME_SC_INTERNAL | NVME_SC_DNR;
-
+	ret = __blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
+			GFP_KERNEL, &bio, 0);
+	status = blk_to_nvme_status(req, errno_to_blk_status(ret));
 	if (bio) {
 		bio->bi_private = req;
 		bio->bi_end_io = nvmet_bio_done;
@@ -248,6 +307,7 @@ u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req)
 	default:
 		pr_err("unhandled cmd %d on qid %d\n", cmd->common.opcode,
 		       req->sq->qid);
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 }

From c6aa3542e01026a94d24713ee2c0dce517e9b6de Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:43 -0800
Subject: [PATCH 306/351] nvmet: add error log support for file backend

This patch adds support for the file backend to populate the
error log entries. Here we map the errno to the NVMe status codes.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/core.c        | 38 +++++++++++++++++++++++++++++++
 drivers/nvme/target/io-cmd-file.c | 35 ++++++++++++++++------------
 drivers/nvme/target/nvmet.h       |  2 ++
 3 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 0b969977e361..cc81d0231587 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -45,6 +45,44 @@ u32 nvmet_ana_group_enabled[NVMET_MAX_ANAGRPS + 1];
 u64 nvmet_ana_chgcnt;
 DECLARE_RWSEM(nvmet_ana_sem);
 
+inline u16 errno_to_nvme_status(struct nvmet_req *req, int errno)
+{
+	u16 status;
+
+	switch (errno) {
+	case -ENOSPC:
+		req->error_loc = offsetof(struct nvme_rw_command, length);
+		status = NVME_SC_CAP_EXCEEDED | NVME_SC_DNR;
+		break;
+	case -EREMOTEIO:
+		req->error_loc = offsetof(struct nvme_rw_command, slba);
+		status = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+		break;
+	case -EOPNOTSUPP:
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+		switch (req->cmd->common.opcode) {
+		case nvme_cmd_dsm:
+		case nvme_cmd_write_zeroes:
+			status = NVME_SC_ONCS_NOT_SUPPORTED | NVME_SC_DNR;
+			break;
+		default:
+			status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
+		}
+		break;
+	case -ENODATA:
+		req->error_loc = offsetof(struct nvme_rw_command, nsid);
+		status = NVME_SC_ACCESS_DENIED;
+		break;
+	case -EIO:
+		/* FALLTHRU */
+	default:
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
+		status = NVME_SC_INTERNAL | NVME_SC_DNR;
+	}
+
+	return status;
+}
+
 static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port,
 		const char *subsysnqn);
 
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 12eaa8ddc248..517522305e5c 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -112,6 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
 static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
 {
 	struct nvmet_req *req = container_of(iocb, struct nvmet_req, f.iocb);
+	u16 status = NVME_SC_SUCCESS;
 
 	if (req->f.bvec != req->inline_bvec) {
 		if (likely(req->f.mpool_alloc == false))
@@ -120,8 +121,9 @@ static void nvmet_file_io_done(struct kiocb *iocb, long ret, long ret2)
 			mempool_free(req->f.bvec, req->ns->bvec_pool);
 	}
 
-	nvmet_req_complete(req, ret != req->data_len ?
-			NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+	if (unlikely(ret != req->data_len))
+		status = errno_to_nvme_status(req, ret);
+	nvmet_req_complete(req, status);
 }
 
 static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
@@ -140,7 +142,7 @@ static bool nvmet_file_execute_io(struct nvmet_req *req, int ki_flags)
 
 	pos = le64_to_cpu(req->cmd->rw.slba) << req->ns->blksize_shift;
 	if (unlikely(pos + req->data_len > req->ns->size)) {
-		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+		nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
 		return true;
 	}
 
@@ -254,9 +256,7 @@ static void nvmet_file_execute_rw(struct nvmet_req *req)
 
 u16 nvmet_file_flush(struct nvmet_req *req)
 {
-	if (vfs_fsync(req->ns->file, 1) < 0)
-		return NVME_SC_INTERNAL | NVME_SC_DNR;
-	return 0;
+	return errno_to_nvme_status(req, vfs_fsync(req->ns->file, 1));
 }
 
 static void nvmet_file_flush_work(struct work_struct *w)
@@ -277,30 +277,34 @@ static void nvmet_file_execute_discard(struct nvmet_req *req)
 	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
 	struct nvme_dsm_range range;
 	loff_t offset, len;
-	u16 ret;
+	u16 status = 0;
+	int ret;
 	int i;
 
 	for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) {
-		ret = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
+		status = nvmet_copy_from_sgl(req, i * sizeof(range), &range,
 					sizeof(range));
-		if (ret)
+		if (status)
 			break;
 
 		offset = le64_to_cpu(range.slba) << req->ns->blksize_shift;
 		len = le32_to_cpu(range.nlb);
 		len <<= req->ns->blksize_shift;
 		if (offset + len > req->ns->size) {
-			ret = NVME_SC_LBA_RANGE | NVME_SC_DNR;
+			req->error_slba = le64_to_cpu(range.slba);
+			status = errno_to_nvme_status(req, -ENOSPC);
 			break;
 		}
 
-		if (vfs_fallocate(req->ns->file, mode, offset, len)) {
-			ret = NVME_SC_INTERNAL | NVME_SC_DNR;
+		ret = vfs_fallocate(req->ns->file, mode, offset, len);
+		if (ret) {
+			req->error_slba = le64_to_cpu(range.slba);
+			status = errno_to_nvme_status(req, ret);
 			break;
 		}
 	}
 
-	nvmet_req_complete(req, ret);
+	nvmet_req_complete(req, status);
 }
 
 static void nvmet_file_dsm_work(struct work_struct *w)
@@ -340,12 +344,12 @@ static void nvmet_file_write_zeroes_work(struct work_struct *w)
 			req->ns->blksize_shift);
 
 	if (unlikely(offset + len > req->ns->size)) {
-		nvmet_req_complete(req, NVME_SC_LBA_RANGE | NVME_SC_DNR);
+		nvmet_req_complete(req, errno_to_nvme_status(req, -ENOSPC));
 		return;
 	}
 
 	ret = vfs_fallocate(req->ns->file, mode, offset, len);
-	nvmet_req_complete(req, ret < 0 ? NVME_SC_INTERNAL | NVME_SC_DNR : 0);
+	nvmet_req_complete(req, ret < 0 ? errno_to_nvme_status(req, ret) : 0);
 }
 
 static void nvmet_file_execute_write_zeroes(struct nvmet_req *req)
@@ -380,6 +384,7 @@ u16 nvmet_file_parse_io_cmd(struct nvmet_req *req)
 	default:
 		pr_err("unhandled cmd for file ns %d on qid %d\n",
 				cmd->common.opcode, req->sq->qid);
+		req->error_loc = offsetof(struct nvme_common_command, opcode);
 		return NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
 	}
 }
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index a2a7fd69b66a..3b5f0bcaf3e8 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -494,4 +494,6 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req)
 	return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) <<
 			req->ns->blksize_shift;
 }
+
+u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
 #endif /* _NVMET_H */

From 11ad507784ed5113af97903af1b0c4aea6b90690 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:47 -0800
Subject: [PATCH 307/351] nvmet: add error log page cmd handler

Now that we have support for all the major parts of the target we add
a NVMe error log page handler so that host can read the log page.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 36 ++++++++++++++++++++++++++-------
 1 file changed, 29 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index fa62db7a5e9e..00956b4106a8 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -37,6 +37,34 @@ static void nvmet_execute_get_log_page_noop(struct nvmet_req *req)
 	nvmet_req_complete(req, nvmet_zero_sgl(req, 0, req->data_len));
 }
 
+static void nvmet_execute_get_log_page_error(struct nvmet_req *req)
+{
+	struct nvmet_ctrl *ctrl = req->sq->ctrl;
+	u16 status = NVME_SC_SUCCESS;
+	unsigned long flags;
+	off_t offset = 0;
+	u64 slot;
+	u64 i;
+
+	spin_lock_irqsave(&ctrl->error_lock, flags);
+	slot = ctrl->err_counter % NVMET_ERROR_LOG_SLOTS;
+
+	for (i = 0; i < NVMET_ERROR_LOG_SLOTS; i++) {
+		status = nvmet_copy_to_sgl(req, offset, &ctrl->slots[slot],
+				sizeof(struct nvme_error_slot));
+		if (status)
+			break;
+
+		if (slot == 0)
+			slot = NVMET_ERROR_LOG_SLOTS - 1;
+		else
+			slot--;
+		offset += sizeof(struct nvme_error_slot);
+	}
+	spin_unlock_irqrestore(&ctrl->error_lock, flags);
+	nvmet_req_complete(req, status);
+}
+
 static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req,
 		struct nvme_smart_log *slog)
 {
@@ -789,13 +817,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
 
 		switch (cmd->get_log_page.lid) {
 		case NVME_LOG_ERROR:
-			/*
-			 * We currently never set the More bit in the status
-			 * field, so all error log entries are invalid and can
-			 * be zeroed out.  This is called a minum viable
-			 * implementation (TM) of this mandatory log page.
-			 */
-			req->execute = nvmet_execute_get_log_page_noop;
+			req->execute = nvmet_execute_get_log_page_error;
 			return 0;
 		case NVME_LOG_SMART:
 			req->execute = nvmet_execute_get_log_page_smart;

From 23454d59cc16ddddf4b2290bbe60d2d9581dfd9a Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Date: Wed, 12 Dec 2018 15:11:48 -0800
Subject: [PATCH 308/351] nvmet: update smart log with num err log entries

Now that we have error log page implementation update smart log command
handler to provide number of error log entries in the lifetime of the
controller field.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/target/admin-cmd.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 00956b4106a8..11baeb14c388 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -135,6 +135,7 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
 {
 	struct nvme_smart_log *log;
 	u16 status = NVME_SC_INTERNAL;
+	unsigned long flags;
 
 	if (req->data_len != sizeof(*log))
 		goto out;
@@ -150,6 +151,11 @@ static void nvmet_execute_get_log_page_smart(struct nvmet_req *req)
 	if (status)
 		goto out_free_log;
 
+	spin_lock_irqsave(&req->sq->ctrl->error_lock, flags);
+	put_unaligned_le64(req->sq->ctrl->err_counter,
+			&log->num_err_log_entries);
+	spin_unlock_irqrestore(&req->sq->ctrl->error_lock, flags);
+
 	status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log));
 out_free_log:
 	kfree(log);

From e42b3867de4bd5ee3a1849afb68a1fa8627f7282 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 11 Dec 2018 23:38:54 -0800
Subject: [PATCH 309/351] blk-mq-rdma: pass in queue map to
 blk_mq_rdma_map_queues

Will be used by nvme-rdma for queue map separation support.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-rdma.c         | 8 ++++----
 drivers/nvme/host/rdma.c    | 2 +-
 include/linux/blk-mq-rdma.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index a71576aff3a5..45030a81a1ed 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -29,24 +29,24 @@
  * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
  * vector, we fallback to the naive mapping.
  */
-int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
+int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
 		struct ib_device *dev, int first_vec)
 {
 	const struct cpumask *mask;
 	unsigned int queue, cpu;
 
-	for (queue = 0; queue < set->nr_hw_queues; queue++) {
+	for (queue = 0; queue < map->nr_queues; queue++) {
 		mask = ib_get_vector_affinity(dev, first_vec + queue);
 		if (!mask)
 			goto fallback;
 
 		for_each_cpu(cpu, mask)
-			set->map[0].mq_map[cpu] = queue;
+			map->mq_map[cpu] = map->queue_offset + queue;
 	}
 
 	return 0;
 
 fallback:
-	return blk_mq_map_queues(&set->map[0]);
+	return blk_mq_map_queues(map);
 }
 EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index f2db848f6985..5057d5ab5aaa 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1751,7 +1751,7 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_rdma_ctrl *ctrl = set->driver_data;
 
-	return blk_mq_rdma_map_queues(set, ctrl->device->dev, 0);
+	return blk_mq_rdma_map_queues(&set->map[0], ctrl->device->dev, 0);
 }
 
 static const struct blk_mq_ops nvme_rdma_mq_ops = {
diff --git a/include/linux/blk-mq-rdma.h b/include/linux/blk-mq-rdma.h
index b4ade198007d..7b6ecf9ac4c3 100644
--- a/include/linux/blk-mq-rdma.h
+++ b/include/linux/blk-mq-rdma.h
@@ -4,7 +4,7 @@
 struct blk_mq_tag_set;
 struct ib_device;
 
-int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
+int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
 		struct ib_device *dev, int first_vec);
 
 #endif /* _LINUX_BLK_MQ_RDMA_H */

From fa9a1811e094658e53b0c82b6ce0431c4c54fc1b Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 11 Dec 2018 23:38:55 -0800
Subject: [PATCH 310/351] nvme-fabrics: add missing nvmf_ctrl_options
 documentation

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 524a02a67817..28dc916ef26b 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -88,6 +88,9 @@ enum {
  * @max_reconnects: maximum number of allowed reconnect attempts before removing
  *              the controller, (-1) means reconnect forever, zero means remove
  *              immediately;
+ * @disable_sqflow: disable controller sq flow control
+ * @hdr_digest: generate/verify header digest (TCP)
+ * @data_digest: generate/verify data digest (TCP)
  */
 struct nvmf_ctrl_options {
 	unsigned		mask;

From 330f6b8a70771f0b059b7bcbc9a28d8023235b55 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 11 Dec 2018 23:38:56 -0800
Subject: [PATCH 311/351] nvme-fabrics: allow user to set nr_write_queues for
 separate queue maps

This argument will specify how many I/O queues will be connected in
create_ctrl in addition to nr_io_queues. With this configuration, I/O
that carries payload from the host to the target, will use the default
hctx queue map, and I/O that involves target to host transfers will use
the read hctx queue map.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/fabrics.c | 13 +++++++++++++
 drivers/nvme/host/fabrics.h |  3 +++
 2 files changed, 16 insertions(+)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 9c62c6838b76..19ff0eae4582 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -616,6 +616,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_DISABLE_SQFLOW,	"disable_sqflow"	},
 	{ NVMF_OPT_HDR_DIGEST,		"hdr_digest"		},
 	{ NVMF_OPT_DATA_DIGEST,		"data_digest"		},
+	{ NVMF_OPT_NR_WRITE_QUEUES,	"nr_write_queues=%d"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -837,6 +838,18 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 		case NVMF_OPT_DATA_DIGEST:
 			opts->data_digest = true;
 			break;
+		case NVMF_OPT_NR_WRITE_QUEUES:
+			if (match_int(args, &token)) {
+				ret = -EINVAL;
+				goto out;
+			}
+			if (token <= 0) {
+				pr_err("Invalid nr_write_queues %d\n", token);
+				ret = -EINVAL;
+				goto out;
+			}
+			opts->nr_write_queues = token;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 28dc916ef26b..81b8fd1c0c5d 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -61,6 +61,7 @@ enum {
 	NVMF_OPT_DISABLE_SQFLOW = 1 << 14,
 	NVMF_OPT_HDR_DIGEST	= 1 << 15,
 	NVMF_OPT_DATA_DIGEST	= 1 << 16,
+	NVMF_OPT_NR_WRITE_QUEUES = 1 << 17,
 };
 
 /**
@@ -91,6 +92,7 @@ enum {
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
+ * @nr_write_queues: number of queues for write I/O
  */
 struct nvmf_ctrl_options {
 	unsigned		mask;
@@ -110,6 +112,7 @@ struct nvmf_ctrl_options {
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
+	unsigned int		nr_write_queues;
 };
 
 /*

From 873946f4b957bf6e6a0351dd5282c73e0c870be9 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 11 Dec 2018 23:38:57 -0800
Subject: [PATCH 312/351] nvme-tcp: support separate queue maps for read and
 write

Allow NVMF_OPT_NR_WRITE_QUEUES to describe additional write queues.  In
addition, implement .map_queues that will apply 2 queue maps for read
and write queue sets.

Note that with the separate queue map, HCTX_TYPE_READ will always use
nr_io_queues and HCTX_TYPE_DEFAULT will use nr_write_queues.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 47 +++++++++++++++++++++++++++++++++++------
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 15543358e245..51826479a41e 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1215,7 +1215,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
 	struct linger sol = { .l_onoff = 1, .l_linger = 0 };
-	int ret, opt, rcv_pdu_size;
+	int ret, opt, rcv_pdu_size, n;
 
 	queue->ctrl = ctrl;
 	INIT_LIST_HEAD(&queue->send_list);
@@ -1271,7 +1271,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
 	}
 
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
-	queue->io_cpu = (qid == 0) ? 0 : qid - 1;
+	if (!qid)
+		n = 0;
+	else
+		n = (qid - 1) % num_online_cpus();
+	queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
 	queue->request = NULL;
 	queue->data_remaining = 0;
 	queue->ddgst_remaining = 0;
@@ -1433,6 +1437,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
+		set->nr_maps = 2 /* default + read */;
 	}
 
 	ret = blk_mq_alloc_tag_set(set);
@@ -1527,7 +1532,12 @@ out_free_queues:
 
 static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
 {
-	return min(ctrl->queue_count - 1, num_online_cpus());
+	unsigned int nr_io_queues;
+
+	nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
+	nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
+
+	return nr_io_queues;
 }
 
 static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl)
@@ -2052,6 +2062,29 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_STS_OK;
 }
 
+static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
+{
+	struct nvme_tcp_ctrl *ctrl = set->driver_data;
+
+	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+	set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
+	if (ctrl->ctrl.opts->nr_write_queues) {
+		/* separate read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+				ctrl->ctrl.opts->nr_write_queues;
+		set->map[HCTX_TYPE_READ].queue_offset =
+				ctrl->ctrl.opts->nr_write_queues;
+	} else {
+		/* mixed read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+				ctrl->ctrl.opts->nr_io_queues;
+		set->map[HCTX_TYPE_READ].queue_offset = 0;
+	}
+	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+	return 0;
+}
+
 static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.queue_rq	= nvme_tcp_queue_rq,
 	.complete	= nvme_complete_rq,
@@ -2059,6 +2092,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = {
 	.exit_request	= nvme_tcp_exit_request,
 	.init_hctx	= nvme_tcp_init_hctx,
 	.timeout	= nvme_tcp_timeout,
+	.map_queues	= nvme_tcp_map_queues,
 };
 
 static struct blk_mq_ops nvme_tcp_admin_mq_ops = {
@@ -2113,7 +2147,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 
 	INIT_LIST_HEAD(&ctrl->list);
 	ctrl->ctrl.opts = opts;
-	ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
 
@@ -2155,7 +2189,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 		goto out_free_ctrl;
 	}
 
-	ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues),
+	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
 				GFP_KERNEL);
 	if (!ctrl->queues) {
 		ret = -ENOMEM;
@@ -2206,7 +2240,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 	.required_opts	= NVMF_OPT_TRADDR,
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
 			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
-			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST,
+			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
+			  NVMF_OPT_NR_WRITE_QUEUES,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
 

From b65bb777ef2237030f2802f2263ae9a0108f7acf Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Tue, 11 Dec 2018 23:38:58 -0800
Subject: [PATCH 313/351] nvme-rdma: support separate queue maps for read and
 write

llow NVMF_OPT_NR_WRITE_QUEUES to describe additional write queues.  In
addition, implement .map_queues that will apply 2 queue maps for read
and write queue sets.

Note that with the separate queue map, HCTX_TYPE_READ will always use
nr_io_queues and HCTX_TYPE_DEFAULT will use nr_write_queues.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/rdma.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 5057d5ab5aaa..ed726da77b5b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -645,6 +645,8 @@ static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
 	nr_io_queues = min_t(unsigned int, nr_io_queues,
 				ibdev->num_comp_vectors);
 
+	nr_io_queues += min(opts->nr_write_queues, num_online_cpus());
+
 	ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 	if (ret)
 		return ret;
@@ -714,6 +716,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->driver_data = ctrl;
 		set->nr_hw_queues = nctrl->queue_count - 1;
 		set->timeout = NVME_IO_TIMEOUT;
+		set->nr_maps = 2 /* default + read */;
 	}
 
 	ret = blk_mq_alloc_tag_set(set);
@@ -1751,7 +1754,25 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_rdma_ctrl *ctrl = set->driver_data;
 
-	return blk_mq_rdma_map_queues(&set->map[0], ctrl->device->dev, 0);
+	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+	set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues;
+	if (ctrl->ctrl.opts->nr_write_queues) {
+		/* separate read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+				ctrl->ctrl.opts->nr_write_queues;
+		set->map[HCTX_TYPE_READ].queue_offset =
+				ctrl->ctrl.opts->nr_write_queues;
+	} else {
+		/* mixed read/write queues */
+		set->map[HCTX_TYPE_DEFAULT].nr_queues =
+				ctrl->ctrl.opts->nr_io_queues;
+		set->map[HCTX_TYPE_READ].queue_offset = 0;
+	}
+	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
+			ctrl->device->dev, 0);
+	blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
+			ctrl->device->dev, 0);
+	return 0;
 }
 
 static const struct blk_mq_ops nvme_rdma_mq_ops = {
@@ -1906,7 +1927,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
 
-	ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
+	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1;
 	ctrl->ctrl.sqsize = opts->queue_size - 1;
 	ctrl->ctrl.kato = opts->kato;
 
@@ -1957,7 +1978,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = {
 	.module		= THIS_MODULE,
 	.required_opts	= NVMF_OPT_TRADDR,
 	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
-			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
+			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
+			  NVMF_OPT_NR_WRITE_QUEUES,
 	.create_ctrl	= nvme_rdma_create_ctrl,
 };
 

From d2f96f487f471b4c718324ecd1540c89d2be52ac Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Thu, 13 Dec 2018 22:53:46 +0800
Subject: [PATCH 314/351] bcache: add comment for cache_set->fill_iter

We have the following define for btree iterator:
	struct btree_iter {
		size_t size, used;
	#ifdef CONFIG_BCACHE_DEBUG
		struct btree_keys *b;
	#endif
		struct btree_iter_set {
			struct bkey *k, *end;
		} data[MAX_BSETS];
	};

We can see that the length of data[] field is static MAX_BSETS, which is
defined as 4 currently.

But a btree node on disk could have too many bsets for an iterator to fit
on the stack - maybe far more that MAX_BSETS. Have to dynamically allocate
space to host more btree_iter_sets.

bch_cache_set_alloc() will make sure the pool cache_set->fill_iter can
allocate an iterator equipped with enough room that can host
	(sb.bucket_size / sb.block_size)
btree_iter_sets, which is more than static MAX_BSETS.

bch_btree_node_read_done() will use that pool to allocate one iterator, to
host many bsets in one btree node.

Add more comment around cache_set->fill_iter to make code less confusing.

Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h | 6 +++++-
 drivers/md/bcache/btree.c  | 5 +++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index b61b83bbcfff..96d2213f279e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -658,7 +658,11 @@ struct cache_set {
 
 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - have to dynamically allocate them
+	 * on the stack - have to dynamically allocate them.
+	 * bch_cache_set_alloc() will make sure the pool can allocate iterators
+	 * equipped with enough room that can host
+	 *     (sb.bucket_size / sb.block_size)
+	 * btree_iter_sets, which is more than static MAX_BSETS.
 	 */
 	mempool_t		fill_iter;
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3f4211b5cd33..23cb1dc7296b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -207,6 +207,11 @@ void bch_btree_node_read_done(struct btree *b)
 	struct bset *i = btree_bset_first(b);
 	struct btree_iter *iter;
 
+	/*
+	 * c->fill_iter can allocate an iterator with more memory space
+	 * than static MAX_BSETS.
+	 * See the comment arount cache_set->fill_iter.
+	 */
 	iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 	iter->used = 0;

From ae17102316550b4b230a283febe31b2a9ff30084 Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Thu, 13 Dec 2018 22:53:47 +0800
Subject: [PATCH 315/351] bcache: do not check if debug dentry is ERR or NULL
 explicitly on remove

debugfs_remove and debugfs_remove_recursive will check if the dentry
pointer is NULL or ERR, and will do nothing in that case.

Remove the check in cache_set_free and bch_debug_init.

Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/debug.c | 3 +--
 drivers/md/bcache/super.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index 8f448b9c96a1..8b123be05254 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -249,8 +249,7 @@ void bch_debug_init_cache_set(struct cache_set *c)
 
 void bch_debug_exit(void)
 {
-	if (!IS_ERR_OR_NULL(bcache_debug))
-		debugfs_remove_recursive(bcache_debug);
+	debugfs_remove_recursive(bcache_debug);
 }
 
 void __init bch_debug_init(void)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 7bbd670a5a84..5b59d44656c0 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1510,8 +1510,7 @@ static void cache_set_free(struct closure *cl)
 	struct cache *ca;
 	unsigned int i;
 
-	if (!IS_ERR_OR_NULL(c->debug))
-		debugfs_remove(c->debug);
+	debugfs_remove(c->debug);
 
 	bch_open_buckets_free(c);
 	bch_btree_cache_free(c);

From 3db4d0783eaf2a2537f6b83679ad57cba0ad0481 Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Thu, 13 Dec 2018 22:53:48 +0800
Subject: [PATCH 316/351] bcache: update comment for bch_data_insert

commit 220bb38c21b8 ("bcache: Break up struct search") introduced
changes to struct search and s->iop. bypass/bio are fields of struct
data_insert_op now. Update the comment.

Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 3bf35914bb57..15070412a32e 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -311,11 +311,11 @@ err:
  * data is written it calls bch_journal, and after the keys have been added to
  * the next journal write they're inserted into the btree.
  *
- * It inserts the data in s->cache_bio; bi_sector is used for the key offset,
+ * It inserts the data in op->bio; bi_sector is used for the key offset,
  * and op->inode is used for the key inode.
  *
- * If s->bypass is true, instead of inserting the data it invalidates the
- * region of the cache represented by s->cache_bio and op->inode.
+ * If op->bypass is true, instead of inserting the data it invalidates the
+ * region of the cache represented by op->bio and op->inode.
  */
 void bch_data_insert(struct closure *cl)
 {

From 4e361e020e7220fa8fc812acdca4d08814633f49 Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Thu, 13 Dec 2018 22:53:49 +0800
Subject: [PATCH 317/351] bcache: update comment in sysfs.c

We have struct cached_dev allocated by kzalloc in register_bcache(),
which initializes all the fields of cached_dev with 0s. And commit
ce4c3e19e520 ("bcache: Replace bch_read_string_list() by
__sysfs_match_string()") has remove the string "default".

Update the comment.

Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 26f035a0c5b9..d2e5c9892d4d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -16,7 +16,7 @@
 #include <linux/sort.h>
 #include <linux/sched/clock.h>
 
-/* Default is -1; we skip past it for struct cached_dev's cache mode */
+/* Default is 0 ("writethrough") */
 static const char * const bch_cache_modes[] = {
 	"writethrough",
 	"writeback",
@@ -25,7 +25,7 @@ static const char * const bch_cache_modes[] = {
 	NULL
 };
 
-/* Default is -1; we skip past it for stop_when_cache_set_failed */
+/* Default is 0 ("auto") */
 static const char * const bch_stop_on_failure_modes[] = {
 	"auto",
 	"always",

From 79b791466e525c98f6aeee9acf5726e7b27f4231 Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Thu, 13 Dec 2018 22:53:50 +0800
Subject: [PATCH 318/351] bcache: do not mark writeback_running too early

A fresh backing device is not attached to any cache_set, and
has no writeback kthread created until first attached to some
cache_set.

But bch_cached_dev_writeback_init run
"
	dc->writeback_running		= true;
	WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING,
			&dc->disk.flags));
"
for any newly formatted backing devices.

For a fresh standalone backing device, we can get something like
following even if no writeback kthread created:
------------------------
/sys/block/bcache0/bcache# cat writeback_running
1
/sys/block/bcache0/bcache# cat writeback_rate_debug
rate:		512.0k/sec
dirty:		0.0k
target:		0.0k
proportional:	0.0k
integral:	0.0k
change:		0.0k/sec
next io:	-15427384ms

The none ZERO fields are misleading as no alive writeback kthread yet.

Set dc->writeback_running false as no writeback thread created in
bch_cached_dev_writeback_init().

We have writeback thread created and woken up in bch_cached_dev_writeback
_start(). Set dc->writeback_running true before bch_writeback_queue()
called, as a writeback thread will check if dc->writeback_running is true
before writing back dirty data, and hung if false detected.

After the change, we can get the following output for a fresh standalone
backing device:
-----------------------
/sys/block/bcache0/bcache$ cat writeback_running
0
/sys/block/bcache0/bcache# cat writeback_rate_debug
rate:		0.0k/sec
dirty:		0.0k
target:		0.0k
proportional:	0.0k
integral:	0.0k
change:		0.0k/sec
next io:	0ms

v1 -> v2:
  Set dc->writeback_running before bch_writeback_queue() called,

Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/writeback.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 08c3a9f9676c..1696b212ec4e 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -777,7 +777,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	bch_keybuf_init(&dc->writeback_keys);
 
 	dc->writeback_metadata		= true;
-	dc->writeback_running		= true;
+	dc->writeback_running		= false;
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
 	atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -805,6 +805,7 @@ int bch_cached_dev_writeback_start(struct cached_dev *dc)
 		cached_dev_put(dc);
 		return PTR_ERR(dc->writeback_thread);
 	}
+	dc->writeback_running = true;
 
 	WARN_ON(test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
 	schedule_delayed_work(&dc->writeback_rate_update,

From f383ae300c4b6466824fd5c8cbd0a6c4ed2b53d3 Mon Sep 17 00:00:00 2001
From: Shenghui Wang <shhuiw@foxmail.com>
Date: Thu, 13 Dec 2018 22:53:51 +0800
Subject: [PATCH 319/351] bcache: cannot set writeback_running via sysfs if no
 writeback kthread created

"echo 1 > writeback_running" marks writeback_running even if no
writeback kthread created as "d_strtoul(writeback_running)" will simply
set dc-> writeback_running without checking the existence of
dc->writeback_thread.

Add check for setting writeback_running via sysfs: if no writeback
kthread available, reject setting to 1.

v2 -> v3:
  * Make message on wrong assignment more clear.
  * Print name of bcache device instead of name of backing device.

Signed-off-by: Shenghui Wang <shhuiw@foxmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index d2e5c9892d4d..9d5fe12f0c9c 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -384,8 +384,25 @@ STORE(bch_cached_dev)
 	mutex_lock(&bch_register_lock);
 	size = __cached_dev_store(kobj, attr, buf, size);
 
-	if (attr == &sysfs_writeback_running)
-		bch_writeback_queue(dc);
+	if (attr == &sysfs_writeback_running) {
+		/* dc->writeback_running changed in __cached_dev_store() */
+		if (IS_ERR_OR_NULL(dc->writeback_thread)) {
+			/*
+			 * reject setting it to 1 via sysfs if writeback
+			 * kthread is not created yet.
+			 */
+			if (dc->writeback_running) {
+				dc->writeback_running = false;
+				pr_err("%s: failed to run non-existent writeback thread",
+						dc->disk.disk->disk_name);
+			}
+		} else
+			/*
+			 * writeback kthread will check if dc->writeback_running
+			 * is true or false.
+			 */
+			bch_writeback_queue(dc);
+	}
 
 	if (attr == &sysfs_writeback_percent)
 		if (!test_and_set_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))

From cb07ad63682ffcce10e9beaed7828a26780e3df9 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 13 Dec 2018 22:53:52 +0800
Subject: [PATCH 320/351] bcache: introduce force_wake_up_gc()

Garbage collection thread starts to work when c->sectors_to_gc is
negative value, otherwise nothing will happen even the gc thread is
woken up by wake_up_gc().

force_wake_up_gc() sets c->sectors_to_gc to -1 before calling
wake_up_gc(), then gc thread may have chance to run if no one else sets
c->sectors_to_gc to a positive value before gc_should_run().

This routine can be called where the gc thread is woken up and required
to run in force.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/btree.h | 18 ++++++++++++++++++
 drivers/md/bcache/sysfs.c | 17 ++---------------
 2 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index a68d6c55783b..d1c72ef64edf 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -266,6 +266,24 @@ static inline void wake_up_gc(struct cache_set *c)
 	wake_up(&c->gc_wait);
 }
 
+static inline void force_wake_up_gc(struct cache_set *c)
+{
+	/*
+	 * Garbage collection thread only works when sectors_to_gc < 0,
+	 * calling wake_up_gc() won't start gc thread if sectors_to_gc is
+	 * not a nagetive value.
+	 * Therefore sectors_to_gc is set to -1 here, before waking up
+	 * gc thread by calling wake_up_gc(). Then gc_should_run() will
+	 * give a chance to permit gc thread to run. "Give a chance" means
+	 * before going into gc_should_run(), there is still possibility
+	 * that c->sectors_to_gc being set to other positive value. So
+	 * this routine won't 100% make sure gc thread will be woken up
+	 * to run.
+	 */
+	atomic_set(&c->sectors_to_gc, -1);
+	wake_up_gc(c);
+}
+
 #define MAP_DONE	0
 #define MAP_CONTINUE	1
 
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 9d5fe12f0c9c..c09748497cdc 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -742,21 +742,8 @@ STORE(__bch_cache_set)
 		bch_cache_accounting_clear(&c->accounting);
 	}
 
-	if (attr == &sysfs_trigger_gc) {
-		/*
-		 * Garbage collection thread only works when sectors_to_gc < 0,
-		 * when users write to sysfs entry trigger_gc, most of time
-		 * they want to forcibly triger gargage collection. Here -1 is
-		 * set to c->sectors_to_gc, to make gc_should_run() give a
-		 * chance to permit gc thread to run. "give a chance" means
-		 * before going into gc_should_run(), there is still chance
-		 * that c->sectors_to_gc being set to other positive value. So
-		 * writing sysfs entry trigger_gc won't always make sure gc
-		 * thread takes effect.
-		 */
-		atomic_set(&c->sectors_to_gc, -1);
-		wake_up_gc(c);
-	}
+	if (attr == &sysfs_trigger_gc)
+		force_wake_up_gc(c);
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;

From 7a671d8ef821bf5743fdff17fae0600648345b03 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 13 Dec 2018 22:53:53 +0800
Subject: [PATCH 321/351] bcache: option to automatically run gc thread after
 writeback

The option gc_after_writeback is disabled by default, because garbage
collection will discard SSD data which drops cached data.

Echo 1 into /sys/fs/bcache/<UUID>/internal/gc_after_writeback will
enable this option, which wakes up gc thread when writeback accomplished
and all cached data is clean.

This option is helpful for people who cares writing performance more. In
heavy writing workload, all cached data can be clean only happens when
writeback thread cleans all cached data in I/O idle time. In such
situation a following gc running may help to shrink bcache B+ tree and
discard more clean data, which may be helpful for future writing
requests.

If you are not sure whether this is helpful for your own workload,
please leave it as disabled by default.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/bcache.h    | 14 ++++++++++++++
 drivers/md/bcache/sysfs.c     |  9 +++++++++
 drivers/md/bcache/writeback.c | 27 +++++++++++++++++++++++++++
 drivers/md/bcache/writeback.h |  2 ++
 4 files changed, 52 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 96d2213f279e..fdf75352e16a 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -626,6 +626,20 @@ struct cache_set {
 	/* Where in the btree gc currently is */
 	struct bkey		gc_done;
 
+	/*
+	 * For automatical garbage collection after writeback completed, this
+	 * varialbe is used as bit fields,
+	 * - 0000 0001b (BCH_ENABLE_AUTO_GC): enable gc after writeback
+	 * - 0000 0010b (BCH_DO_AUTO_GC):     do gc after writeback
+	 * This is an optimization for following write request after writeback
+	 * finished, but read hit rate dropped due to clean data on cache is
+	 * discarded. Unless user explicitly sets it via sysfs, it won't be
+	 * enabled.
+	 */
+#define BCH_ENABLE_AUTO_GC	1
+#define BCH_DO_AUTO_GC		2
+	uint8_t			gc_after_writeback;
+
 	/*
 	 * The allocation code needs gc_mark in struct bucket to be correct, but
 	 * it's not while a gc is in progress. Protected by bucket_lock.
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index c09748497cdc..621186b4240f 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -128,6 +128,7 @@ rw_attribute(expensive_debug_checks);
 rw_attribute(cache_replacement_policy);
 rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
+rw_attribute(gc_after_writeback);
 rw_attribute(size);
 
 static ssize_t bch_snprint_string_list(char *buf,
@@ -693,6 +694,7 @@ SHOW(__bch_cache_set)
 	sysfs_printf(gc_always_rewrite,		"%i", c->gc_always_rewrite);
 	sysfs_printf(btree_shrinker_disabled,	"%i", c->shrinker_disabled);
 	sysfs_printf(copy_gc_enabled,		"%i", c->copy_gc_enabled);
+	sysfs_printf(gc_after_writeback,	"%i", c->gc_after_writeback);
 	sysfs_printf(io_disable,		"%i",
 		     test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
@@ -793,6 +795,12 @@ STORE(__bch_cache_set)
 	sysfs_strtoul(gc_always_rewrite,	c->gc_always_rewrite);
 	sysfs_strtoul(btree_shrinker_disabled,	c->shrinker_disabled);
 	sysfs_strtoul(copy_gc_enabled,		c->copy_gc_enabled);
+	/*
+	 * write gc_after_writeback here may overwrite an already set
+	 * BCH_DO_AUTO_GC, it doesn't matter because this flag will be
+	 * set in next chance.
+	 */
+	sysfs_strtoul_clamp(gc_after_writeback, c->gc_after_writeback, 0, 1);
 
 	return size;
 }
@@ -873,6 +881,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_gc_always_rewrite,
 	&sysfs_btree_shrinker_disabled,
 	&sysfs_copy_gc_enabled,
+	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
 	NULL
 };
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 1696b212ec4e..73f0efac2b9f 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -17,6 +17,15 @@
 #include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
+static void update_gc_after_writeback(struct cache_set *c)
+{
+	if (c->gc_after_writeback != (BCH_ENABLE_AUTO_GC) ||
+	    c->gc_stats.in_use < BCH_AUTO_GC_DIRTY_THRESHOLD)
+		return;
+
+	c->gc_after_writeback |= BCH_DO_AUTO_GC;
+}
+
 /* Rate limiting */
 static uint64_t __calc_target_rate(struct cached_dev *dc)
 {
@@ -191,6 +200,7 @@ static void update_writeback_rate(struct work_struct *work)
 		if (!set_at_max_writeback_rate(c, dc)) {
 			down_read(&dc->writeback_lock);
 			__update_writeback_rate(dc);
+			update_gc_after_writeback(c);
 			up_read(&dc->writeback_lock);
 		}
 	}
@@ -689,6 +699,23 @@ static int bch_writeback_thread(void *arg)
 				up_write(&dc->writeback_lock);
 				break;
 			}
+
+			/*
+			 * When dirty data rate is high (e.g. 50%+), there might
+			 * be heavy buckets fragmentation after writeback
+			 * finished, which hurts following write performance.
+			 * If users really care about write performance they
+			 * may set BCH_ENABLE_AUTO_GC via sysfs, then when
+			 * BCH_DO_AUTO_GC is set, garbage collection thread
+			 * will be wake up here. After moving gc, the shrunk
+			 * btree and discarded free buckets SSD space may be
+			 * helpful for following write requests.
+			 */
+			if (c->gc_after_writeback ==
+			    (BCH_ENABLE_AUTO_GC|BCH_DO_AUTO_GC)) {
+				c->gc_after_writeback &= ~BCH_DO_AUTO_GC;
+				force_wake_up_gc(c);
+			}
 		}
 
 		up_write(&dc->writeback_lock);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index d2b9fdbc8994..ce63be98a39d 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -11,6 +11,8 @@
 #define WRITEBACK_RATE_UPDATE_SECS_MAX		60
 #define WRITEBACK_RATE_UPDATE_SECS_DEFAULT	5
 
+#define BCH_AUTO_GC_DIRTY_THRESHOLD	50
+
 /*
  * 14 (16384ths) is chosen here as something that each backing device
  * should be a reasonable fraction of the share, and not to blow up

From 009673d02fa92acaa7ed0b1e1389610e4390ba49 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 13 Dec 2018 22:53:54 +0800
Subject: [PATCH 322/351] bcache: add MODULE_DESCRIPTION information

This patch moves MODULE_AUTHOR and MODULE_LICENSE to end of super.c, and
add MODULE_DESCRIPTION("Bcache: a Linux block layer cache").

This is preparation for adding module parameters.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 5b59d44656c0..61d3b63fa617 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,9 +25,6 @@
 #include <linux/reboot.h>
 #include <linux/sysfs.h>
 
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
-
 static const char bcache_magic[] = {
 	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
 	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
@@ -2469,3 +2466,7 @@ err:
 
 module_exit(bcache_exit);
 module_init(bcache_init);
+
+MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
+MODULE_LICENSE("GPL");

From 9aaf51654672b16566c5fe787da3ca41ebf6d297 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 13 Dec 2018 22:53:55 +0800
Subject: [PATCH 323/351] bcache: make cutoff_writeback and
 cutoff_writeback_sync tunable

Currently the cutoff writeback and cutoff writeback sync thresholds are
defined by CUTOFF_WRITEBACK (40) and CUTOFF_WRITEBACK_SYNC (70) as
static values. Most of time these they work fine, but when people want
to do research on bcache writeback mode performance tuning, there is no
chance to modify the soft and hard cutoff writeback values.

This patch introduces two module parameters bch_cutoff_writeback_sync
and bch_cutoff_writeback which permit people to tune the values when
loading bcache.ko. If they are not specified by module loading, current
values CUTOFF_WRITEBACK_SYNC and CUTOFF_WRITEBACK will be used as
default and nothing changes.

When people want to tune this two values,
- cutoff_writeback can be set in range [1, 70]
- cutoff_writeback_sync can be set in range [1, 90]
- cutoff_writeback always <= cutoff_writeback_sync

The default values are strongly recommended to most of users for most of
workloads. Anyway, if people wants to take their own risk to do research
on new writeback cutoff tuning for their own workload, now they can make
it.

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/super.c     | 40 +++++++++++++++++++++++++++++++++++
 drivers/md/bcache/sysfs.c     |  7 ++++++
 drivers/md/bcache/writeback.h | 10 +++++++--
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 61d3b63fa617..4dee119c3664 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -25,6 +25,9 @@
 #include <linux/reboot.h>
 #include <linux/sysfs.h>
 
+unsigned int bch_cutoff_writeback;
+unsigned int bch_cutoff_writeback_sync;
+
 static const char bcache_magic[] = {
 	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
 	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
@@ -2420,6 +2423,32 @@ static void bcache_exit(void)
 	mutex_destroy(&bch_register_lock);
 }
 
+/* Check and fixup module parameters */
+static void check_module_parameters(void)
+{
+	if (bch_cutoff_writeback_sync == 0)
+		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
+	else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
+		pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
+			bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
+		bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
+	}
+
+	if (bch_cutoff_writeback == 0)
+		bch_cutoff_writeback = CUTOFF_WRITEBACK;
+	else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
+		pr_warn("set bch_cutoff_writeback (%u) to max value %u",
+			bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
+		bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
+	}
+
+	if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
+		pr_warn("set bch_cutoff_writeback (%u) to %u",
+			bch_cutoff_writeback, bch_cutoff_writeback_sync);
+		bch_cutoff_writeback = bch_cutoff_writeback_sync;
+	}
+}
+
 static int __init bcache_init(void)
 {
 	static const struct attribute *files[] = {
@@ -2428,6 +2457,8 @@ static int __init bcache_init(void)
 		NULL
 	};
 
+	check_module_parameters();
+
 	mutex_init(&bch_register_lock);
 	init_waitqueue_head(&unregister_wait);
 	register_reboot_notifier(&reboot);
@@ -2464,9 +2495,18 @@ err:
 	return -ENOMEM;
 }
 
+/*
+ * Module hooks
+ */
 module_exit(bcache_exit);
 module_init(bcache_init);
 
+module_param(bch_cutoff_writeback, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
+
+module_param(bch_cutoff_writeback_sync, uint, 0);
+MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
+
 MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
 MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 621186b4240f..482b128b3e9d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -88,6 +88,8 @@ read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
 read_attribute(congested);
+read_attribute(cutoff_writeback);
+read_attribute(cutoff_writeback_sync);
 rw_attribute(congested_read_threshold_us);
 rw_attribute(congested_write_threshold_us);
 
@@ -686,6 +688,9 @@ SHOW(__bch_cache_set)
 	sysfs_print(congested_write_threshold_us,
 		    c->congested_write_threshold_us);
 
+	sysfs_print(cutoff_writeback, bch_cutoff_writeback);
+	sysfs_print(cutoff_writeback_sync, bch_cutoff_writeback_sync);
+
 	sysfs_print(active_journal_entries,	fifo_used(&c->journal.pin));
 	sysfs_printf(verify,			"%i", c->verify);
 	sysfs_printf(key_merging_disabled,	"%i", c->key_merging_disabled);
@@ -883,6 +888,8 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_copy_gc_enabled,
 	&sysfs_gc_after_writeback,
 	&sysfs_io_disable,
+	&sysfs_cutoff_writeback,
+	&sysfs_cutoff_writeback_sync,
 	NULL
 };
 KTYPE(bch_cache_set_internal);
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index ce63be98a39d..6a743d3bb338 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,9 @@
 #define CUTOFF_WRITEBACK	40
 #define CUTOFF_WRITEBACK_SYNC	70
 
+#define CUTOFF_WRITEBACK_MAX		70
+#define CUTOFF_WRITEBACK_SYNC_MAX	90
+
 #define MAX_WRITEBACKS_IN_PASS  5
 #define MAX_WRITESIZE_IN_PASS   5000	/* *512b */
 
@@ -55,6 +58,9 @@ static inline bool bcache_dev_stripe_dirty(struct cached_dev *dc,
 	}
 }
 
+extern unsigned int bch_cutoff_writeback;
+extern unsigned int bch_cutoff_writeback_sync;
+
 static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned int cache_mode, bool would_skip)
 {
@@ -62,7 +68,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
-	    in_use > CUTOFF_WRITEBACK_SYNC)
+	    in_use > bch_cutoff_writeback_sync)
 		return false;
 
 	if (dc->partial_stripes_expensive &&
@@ -75,7 +81,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 
 	return (op_is_sync(bio->bi_opf) ||
 		bio->bi_opf & (REQ_META|REQ_PRIO) ||
-		in_use <= CUTOFF_WRITEBACK);
+		in_use <= bch_cutoff_writeback);
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)

From cc38ca7ed54a1df04ae9f23614b348ebfc918029 Mon Sep 17 00:00:00 2001
From: Coly Li <colyli@suse.de>
Date: Thu, 13 Dec 2018 22:53:56 +0800
Subject: [PATCH 324/351] bcache: set writeback_percent in a flexible range

Because CUTOFF_WRITEBACK is defined as 40, so before the changes of
dynamic cutoff writeback values, writeback_percent is limited to [0,
CUTOFF_WRITEBACK]. Any value larger than CUTOFF_WRITEBACK will be fixed
up to 40.

Now cutof writeback limit is a dynamic value bch_cutoff_writeback, so
the range of writeback_percent can be a more flexible range as [0,
bch_cutoff_writeback]. The flexibility is, it can be expended to a
larger or smaller range than [0, 40], depends on how value
bch_cutoff_writeback is specified.

The default value is still strongly recommended to most of users for
most of workloads. But for people who want to do research on bcache
writeback perforamnce tuning, they may have chance to specify more
flexible writeback_percent in range [0, 70].

Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/sysfs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 482b128b3e9d..557a8a3270a1 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -267,7 +267,8 @@ STORE(__cached_dev)
 	d_strtoul(writeback_running);
 	d_strtoul(writeback_delay);
 
-	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
+			    0, bch_cutoff_writeback);
 
 	if (attr == &sysfs_writeback_rate) {
 		ssize_t ret;

From e78bd0d26f7396c7bd17be60d9686be8ef8e453a Mon Sep 17 00:00:00 2001
From: Guoju Fang <fangguoju@gmail.com>
Date: Thu, 13 Dec 2018 22:53:57 +0800
Subject: [PATCH 325/351] bcache: print number of keys in
 trace_bcache_journal_write

Sometimes flush journal may be very frequent, so it's useful to dump
number of keys every time write journal.

Signed-off-by: Guoju Fang <fangguoju@gmail.com>
Signed-off-by: Coly Li <colyli@suse.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/md/bcache/journal.c   |  2 +-
 include/trace/events/bcache.h | 27 ++++++++++++++++++++++++---
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 522c7426f3a0..b2fd412715b1 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -663,7 +663,7 @@ static void journal_write_unlocked(struct closure *cl)
 				 REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
 		bch_bio_map(bio, w->data);
 
-		trace_bcache_journal_write(bio);
+		trace_bcache_journal_write(bio, w->data->keys);
 		bio_list_add(&list, bio);
 
 		SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors);
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 2cbd6e42ad83..e4526f85c19d 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -221,9 +221,30 @@ DEFINE_EVENT(cache_set, bcache_journal_entry_full,
 	TP_ARGS(c)
 );
 
-DEFINE_EVENT(bcache_bio, bcache_journal_write,
-	TP_PROTO(struct bio *bio),
-	TP_ARGS(bio)
+TRACE_EVENT(bcache_journal_write,
+	TP_PROTO(struct bio *bio, u32 keys),
+	TP_ARGS(bio, keys),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev			)
+		__field(sector_t,	sector			)
+		__field(unsigned int,	nr_sector		)
+		__array(char,		rwbs,	6		)
+		__field(u32,		nr_keys			)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio_dev(bio);
+		__entry->sector		= bio->bi_iter.bi_sector;
+		__entry->nr_sector	= bio->bi_iter.bi_size >> 9;
+		__entry->nr_keys	= keys;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
+	),
+
+	TP_printk("%d,%d  %s %llu + %u keys %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector, __entry->nr_sector,
+		  __entry->nr_keys)
 );
 
 /* Btree */

From 092ff0520070fad8407b196f3bb6156ce77a6f98 Mon Sep 17 00:00:00 2001
From: Sagi Grimberg <sagi@grimberg.me>
Date: Thu, 13 Dec 2018 12:34:07 -0800
Subject: [PATCH 326/351] nvme: fix kernel paging oops

free the controller discard_page correctly.

Fixes: cb5b7262b011 ("nvme: provide fallback for discard alloc failure")
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4d8ee7186268..136512e8ba58 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3596,7 +3596,7 @@ static void nvme_free_ctrl(struct device *dev)
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 	kfree(ctrl->effects);
 	nvme_mpath_uninit(ctrl);
-	kfree(ctrl->discard_page);
+	__free_page(ctrl->discard_page);
 
 	if (subsys) {
 		mutex_lock(&subsys->lock);

From 6c210aa596d0ecf6f3eea65c02ac807877385a18 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Dec 2018 21:32:09 +0100
Subject: [PATCH 327/351] block: remove the bio_phys_segments export

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 036e3f0cc736..fa1ea2ac66a8 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -571,14 +571,13 @@ void bio_put(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_put);
 
-inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
+int bio_phys_segments(struct request_queue *q, struct bio *bio)
 {
 	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
 		blk_recount_segments(q, bio);
 
 	return bio->bi_phys_segments;
 }
-EXPORT_SYMBOL(bio_phys_segments);
 
 /**
  * 	__bio_clone_fast - clone a bio that shares the original bio's biovec

From 637b60ade37ed5465c038c03f0fd1deabadac49c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Dec 2018 21:32:10 +0100
Subject: [PATCH 328/351] block: remove the blk_recount_segments export

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-merge.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 9da5629d0887..e7f1c6cf0167 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -389,7 +389,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio)
 
 	bio_set_flag(bio, BIO_SEG_VALID);
 }
-EXPORT_SYMBOL(blk_recount_segments);
 
 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 				   struct bio *nxt)

From 0374e1132217711bc2e920cde877dd7fc3dbd2d9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Dec 2018 21:32:11 +0100
Subject: [PATCH 329/351] block: remove the unused bio_iov_iter_get_pages
 export

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index fa1ea2ac66a8..e9c6f1d6fcbd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -901,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
 
 static void submit_bio_wait_endio(struct bio *bio)
 {

From a45eb575cdb44d8b493fead6352f17bcc36da66a Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Dec 2018 21:32:12 +0100
Subject: [PATCH 330/351] block: remove the unused bio_set_pages_dirty and
 bio_check_pages_dirty exports

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index e9c6f1d6fcbd..c288b9057042 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1590,7 +1590,6 @@ void bio_set_pages_dirty(struct bio *bio)
 			set_page_dirty_lock(bvec->bv_page);
 	}
 }
-EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 static void bio_release_pages(struct bio *bio)
 {
@@ -1660,7 +1659,6 @@ defer:
 	spin_unlock_irqrestore(&bio_dirty_lock, flags);
 	schedule_work(&bio_dirty_work);
 }
-EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 void update_io_ticks(struct hd_struct *part, unsigned long now)
 {

From 74030653f0b9736f179c1c4e713ba1f2070aa0dd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Dec 2018 21:32:13 +0100
Subject: [PATCH 331/351] block: remove the bioset_integrity_free export

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 290af497997b..7b30ff5b3af4 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -460,7 +460,6 @@ void bioset_integrity_free(struct bio_set *bs)
 	mempool_exit(&bs->bio_integrity_pool);
 	mempool_exit(&bs->bvec_integrity_pool);
 }
-EXPORT_SYMBOL(bioset_integrity_free);
 
 void __init bio_integrity_init(void)
 {

From 4c9770c90fc5b6d6b6d190d108c061015f5804f7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 13 Dec 2018 21:32:14 +0100
Subject: [PATCH 332/351] block: remove the bio_integrity_advance export

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/bio-integrity.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 7b30ff5b3af4..1b633a3526d4 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
 	bip->bip_iter.bi_sector += bytes_done >> 9;
 	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
 }
-EXPORT_SYMBOL(bio_integrity_advance);
 
 /**
  * bio_integrity_trim - Trim integrity vector

From 7f556a44e61d0b62d78db9a2662a5f0daef010f2 Mon Sep 17 00:00:00 2001
From: Jianchao Wang <jianchao.w.wang@oracle.com>
Date: Fri, 14 Dec 2018 09:28:18 +0800
Subject: [PATCH 333/351] blk-mq: refactor the code of issue request directly

Merge blk_mq_try_issue_directly and __blk_mq_try_issue_directly
into one interface to unify the interfaces to issue requests
directly. The merged interface takes over the requests totally,
it could insert, end or do nothing based on the return value of
.queue_rq and 'bypass' parameter. Then caller needn't any other
handling any more and then code could be cleaned up.

And also the commit c616cbee ( blk-mq: punt failed direct issue
to dispatch list ) always inserts requests to hctx dispatch list
whenever get a BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE, this is
overkill and will harm the merging. We just need to do that for
the requests that has been through .queue_rq. This patch also
could fix this.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 103 ++++++++++++++++++++++++++-----------------------
 1 file changed, 54 insertions(+), 49 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9690f4f8de7e..af4dc8227954 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1792,78 +1792,83 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+static blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
 						blk_qc_t *cookie,
-						bool bypass_insert, bool last)
+						bool bypass, bool last)
 {
 	struct request_queue *q = rq->q;
 	bool run_queue = true;
+	blk_status_t ret = BLK_STS_RESOURCE;
+	int srcu_idx;
+	bool force = false;
 
+	hctx_lock(hctx, &srcu_idx);
 	/*
-	 * RCU or SRCU read lock is needed before checking quiesced flag.
+	 * hctx_lock is needed before checking quiesced flag.
 	 *
-	 * When queue is stopped or quiesced, ignore 'bypass_insert' from
-	 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
-	 * and avoid driver to try to dispatch again.
+	 * When queue is stopped or quiesced, ignore 'bypass', insert
+	 * and return BLK_STS_OK to caller, and avoid driver to try to
+	 * dispatch again.
 	 */
-	if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
+	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) {
 		run_queue = false;
-		bypass_insert = false;
-		goto insert;
+		bypass = false;
+		goto out_unlock;
 	}
 
-	if (q->elevator && !bypass_insert)
-		goto insert;
+	if (unlikely(q->elevator && !bypass))
+		goto out_unlock;
 
 	if (!blk_mq_get_dispatch_budget(hctx))
-		goto insert;
+		goto out_unlock;
 
 	if (!blk_mq_get_driver_tag(rq)) {
 		blk_mq_put_dispatch_budget(hctx);
-		goto insert;
+		goto out_unlock;
 	}
 
-	return __blk_mq_issue_directly(hctx, rq, cookie, last);
-insert:
-	if (bypass_insert)
-		return BLK_STS_RESOURCE;
-
-	blk_mq_request_bypass_insert(rq, run_queue);
-	return BLK_STS_OK;
-}
-
-static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
-		struct request *rq, blk_qc_t *cookie)
-{
-	blk_status_t ret;
-	int srcu_idx;
-
-	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
-	hctx_lock(hctx, &srcu_idx);
-
-	ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
-	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
-		blk_mq_request_bypass_insert(rq, true);
-	else if (ret != BLK_STS_OK)
-		blk_mq_end_request(rq, ret);
-
+	/*
+	 * Always add a request that has been through
+	 *.queue_rq() to the hardware dispatch list.
+	 */
+	force = true;
+	ret = __blk_mq_issue_directly(hctx, rq, cookie, last);
+out_unlock:
 	hctx_unlock(hctx, srcu_idx);
+	switch (ret) {
+	case BLK_STS_OK:
+		break;
+	case BLK_STS_DEV_RESOURCE:
+	case BLK_STS_RESOURCE:
+		if (force) {
+			blk_mq_request_bypass_insert(rq, run_queue);
+			/*
+			 * We have to return BLK_STS_OK for the DM
+			 * to avoid livelock. Otherwise, we return
+			 * the real result to indicate whether the
+			 * request is direct-issued successfully.
+			 */
+			ret = bypass ? BLK_STS_OK : ret;
+		} else if (!bypass) {
+			blk_mq_sched_insert_request(rq, false,
+						    run_queue, false);
+		}
+		break;
+	default:
+		if (!bypass)
+			blk_mq_end_request(rq, ret);
+		break;
+	}
+
+	return ret;
 }
 
 blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 {
-	blk_status_t ret;
-	int srcu_idx;
-	blk_qc_t unused_cookie;
-	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	blk_qc_t unused;
 
-	hctx_lock(hctx, &srcu_idx);
-	ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
-	hctx_unlock(hctx, srcu_idx);
-
-	return ret;
+	return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, last);
 }
 
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2004,13 +2009,13 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		if (same_queue_rq) {
 			data.hctx = same_queue_rq->mq_hctx;
 			blk_mq_try_issue_directly(data.hctx, same_queue_rq,
-					&cookie);
+					&cookie, false, true);
 		}
 	} else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
 			!data.hctx->dispatch_busy)) {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);
-		blk_mq_try_issue_directly(data.hctx, rq, &cookie);
+		blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true);
 	} else {
 		blk_mq_put_ctx(data.ctx);
 		blk_mq_bio_to_request(rq, bio);

From 5b7a6f128aad761b471ca0ff620b4841b38e596f Mon Sep 17 00:00:00 2001
From: Jianchao Wang <jianchao.w.wang@oracle.com>
Date: Fri, 14 Dec 2018 09:28:19 +0800
Subject: [PATCH 334/351] blk-mq: issue directly with bypass 'false' in
 blk_mq_sched_insert_requests

It is not necessary to issue request directly with bypass 'true'
in blk_mq_sched_insert_requests and handle the non-issued requests
itself. Just set bypass to 'false' and let blk_mq_try_issue_directly
handle them totally. Remove the blk_rq_can_direct_dispatch check,
because blk_mq_try_issue_directly can handle it well.If request is
direct-issued unsuccessfully, insert the reset.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c |  8 +++-----
 block/blk-mq.c       | 20 +++++++++-----------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index f096d8989773..5b4d52d9cba2 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -417,12 +417,10 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
 		 * busy in case of 'none' scheduler, and this way may save
 		 * us one extra enqueue & dequeue to sw queue.
 		 */
-		if (!hctx->dispatch_busy && !e && !run_queue_async) {
+		if (!hctx->dispatch_busy && !e && !run_queue_async)
 			blk_mq_try_issue_list_directly(hctx, list);
-			if (list_empty(list))
-				return;
-		}
-		blk_mq_insert_requests(hctx, ctx, list);
+		else
+			blk_mq_insert_requests(hctx, ctx, list);
 	}
 
 	blk_mq_run_hw_queue(hctx, run_queue_async);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index af4dc8227954..90361fe758f8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1874,22 +1874,20 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 		struct list_head *list)
 {
+	blk_qc_t unused;
+	blk_status_t ret = BLK_STS_OK;
+
 	while (!list_empty(list)) {
-		blk_status_t ret;
 		struct request *rq = list_first_entry(list, struct request,
 				queuelist);
 
 		list_del_init(&rq->queuelist);
-		ret = blk_mq_request_issue_directly(rq, list_empty(list));
-		if (ret != BLK_STS_OK) {
-			if (ret == BLK_STS_RESOURCE ||
-					ret == BLK_STS_DEV_RESOURCE) {
-				blk_mq_request_bypass_insert(rq,
+		if (ret == BLK_STS_OK)
+			ret = blk_mq_try_issue_directly(hctx, rq, &unused,
+							false,
 							list_empty(list));
-				break;
-			}
-			blk_mq_end_request(rq, ret);
-		}
+		else
+			blk_mq_sched_insert_request(rq, false, true, false);
 	}
 
 	/*
@@ -1897,7 +1895,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 	 * the driver there was more coming, but that turned out to
 	 * be a lie.
 	 */
-	if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs)
+	if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs)
 		hctx->queue->mq_ops->commit_rqs(hctx);
 }
 

From d6a51a97c0b2e21fec224746c2683ff739bcf4ae Mon Sep 17 00:00:00 2001
From: Jianchao Wang <jianchao.w.wang@oracle.com>
Date: Fri, 14 Dec 2018 09:28:20 +0800
Subject: [PATCH 335/351] blk-mq: replace and kill
 blk_mq_request_issue_directly

Replace blk_mq_request_issue_directly with blk_mq_try_issue_directly
in blk_insert_cloned_request and kill it as nobody uses it any more.

Signed-off-by: Jianchao Wang <jianchao.w.wang@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 4 +++-
 block/blk-mq.c   | 9 +--------
 block/blk-mq.h   | 6 ++++--
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 268d2b8e9843..fa661bac40af 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1240,6 +1240,8 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
  */
 blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
+	blk_qc_t unused;
+
 	if (blk_cloned_rq_check_limits(q, rq))
 		return BLK_STS_IOERR;
 
@@ -1255,7 +1257,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
 	 * bypass a potential scheduler on the bottom device for
 	 * insert.
 	 */
-	return blk_mq_request_issue_directly(rq, true);
+	return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true);
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90361fe758f8..2d3a29eb58ca 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1792,7 +1792,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
 						blk_qc_t *cookie,
 						bool bypass, bool last)
@@ -1864,13 +1864,6 @@ out_unlock:
 	return ret;
 }
 
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
-{
-	blk_qc_t unused;
-
-	return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, last);
-}
-
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 		struct list_head *list)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 0c9c9ea2fefe..b63a0de8a07a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -68,8 +68,10 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct list_head *list);
 
-/* Used by blk_insert_cloned_request() to issue request directly */
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
+blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
+						struct request *rq,
+						blk_qc_t *cookie,
+						bool bypass, bool last);
 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 				    struct list_head *list);
 

From d04c406f29d9f4dbcb5eb5aa79ce0445c7e9d652 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 14 Dec 2018 17:21:22 +0100
Subject: [PATCH 336/351] block: clear REQ_HIPRI if polling is not supported

This prevents a HIPRI bio from being submitted through a stacking
driver that does not support polling and thus won't poll for I/O
completion.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index fa661bac40af..c78042975737 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -921,6 +921,9 @@ generic_make_request_checks(struct bio *bio)
 		}
 	}
 
+	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		bio->bi_opf &= ~REQ_HIPRI;
+
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
 		if (!blk_queue_discard(q))

From e7cc005fef03d2b13246d800e497418d570ad6da Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Sun, 16 Dec 2018 14:08:18 +0800
Subject: [PATCH 337/351] aoe: add __exit annotation

Add __exit annotation to cleanup helper which
is only called once in the module.

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/aoe/aoemain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index 251482066977..1e4e2971171c 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -24,7 +24,7 @@ static void discover_timer(struct timer_list *t)
 	aoecmd_cfg(0xffff, 0xff);
 }
 
-static void
+static void __exit
 aoe_exit(void)
 {
 	del_timer_sync(&timer);

From 38a3499f6d0cb15bd673e517b0656807e22bfd24 Mon Sep 17 00:00:00 2001
From: Chengguang Xu <cgxu519@gmx.com>
Date: Sun, 16 Dec 2018 17:35:00 +0800
Subject: [PATCH 338/351] block: loop: check error using IS_ERR instead of
 IS_ERR_OR_NULL in loop_add()

blk_mq_init_queue() will not return NULL pointer to its caller,
so it's better to replace IS_ERR_OR_NULL using IS_ERR in loop_add().

If in the future things change to check NULL pointer inside loop_add(),
we should return -ENOMEM as return code instead of PTR_ERR(NULL).

Signed-off-by: Chengguang Xu <cgxu519@gmx.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/block/loop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0770004616de..0939f36548c9 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1915,7 +1915,7 @@ static int loop_add(struct loop_device **l, int i)
 		goto out_free_idr;
 
 	lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
-	if (IS_ERR_OR_NULL(lo->lo_queue)) {
+	if (IS_ERR(lo->lo_queue)) {
 		err = PTR_ERR(lo->lo_queue);
 		goto out_cleanup_tags;
 	}

From f9824952ee1cd02ae1a74e35e0e8653f8a4db772 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Fri, 30 Nov 2018 14:36:24 +0900
Subject: [PATCH 339/351] block: update sysfs documentation

Add the description of the zoned, nr_zones and chunk_sectors sysfs queue
attributes to Documentation/block/queue-sysfs.txt. The description of
the zoned and chunk_sector attributes are mostly copied from
ABI/testing/sysfs-block (added a typo fix). While at it, also fix a
typo in the description of the io_poll_delay attribute.

nr_zones description is also added to ABI/testing/sysfs-block and
contact email address updated for the zoned attribute.

Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/ABI/testing/sysfs-block | 12 +++++++++--
 Documentation/block/queue-sysfs.txt   | 29 ++++++++++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index dea212db9df3..7710d4022b19 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -244,7 +244,7 @@ Description:
 
 What:		/sys/block/<disk>/queue/zoned
 Date:		September 2016
-Contact:	Damien Le Moal <damien.lemoal@hgst.com>
+Contact:	Damien Le Moal <damien.lemoal@wdc.com>
 Description:
 		zoned indicates if the device is a zoned block device
 		and the zone model of the device if it is indeed zoned.
@@ -259,6 +259,14 @@ Description:
 		zone commands, they will be treated as regular block
 		devices and zoned will report "none".
 
+What:		/sys/block/<disk>/queue/nr_zones
+Date:		November 2018
+Contact:	Damien Le Moal <damien.lemoal@wdc.com>
+Description:
+		nr_zones indicates the total number of zones of a zoned block
+		device ("host-aware" or "host-managed" zone model). For regular
+		block devices, the value is always 0.
+
 What:		/sys/block/<disk>/queue/chunk_sectors
 Date:		September 2016
 Contact:	Hannes Reinecke <hare@suse.com>
@@ -268,6 +276,6 @@ Description:
 		indicates the size in 512B sectors of the RAID volume
 		stripe segment. For a zoned block device, either
 		host-aware or host-managed, chunk_sectors indicates the
-		size of 512B sectors of the zones of the device, with
+		size in 512B sectors of the zones of the device, with
 		the eventual exception of the last zone of the device
 		which may be smaller.
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 2c1e67058fd3..39e286d7afc9 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -64,7 +64,7 @@ guess, the kernel will put the process issuing IO to sleep for an amount
 of time, before entering a classic poll loop. This mode might be a
 little slower than pure classic polling, but it will be more efficient.
 If set to a value larger than 0, the kernel will put the process issuing
-IO to sleep for this amont of microseconds before entering classic
+IO to sleep for this amount of microseconds before entering classic
 polling.
 
 iostats (RW)
@@ -194,4 +194,31 @@ blk-throttle makes decision based on the samplings. Lower time means cgroups
 have more smooth throughput, but higher CPU overhead. This exists only when
 CONFIG_BLK_DEV_THROTTLING_LOW is enabled.
 
+zoned (RO)
+----------
+This indicates if the device is a zoned block device and the zone model of the
+device if it is indeed zoned. The possible values indicated by zoned are
+"none" for regular block devices and "host-aware" or "host-managed" for zoned
+block devices. The characteristics of host-aware and host-managed zoned block
+devices are described in the ZBC (Zoned Block Commands) and ZAC
+(Zoned Device ATA Command Set) standards. These standards also define the
+"drive-managed" zone model. However, since drive-managed zoned block devices
+do not support zone commands, they will be treated as regular block devices
+and zoned will report "none".
+
+nr_zones (RO)
+-------------
+For zoned block devices (zoned attribute indicating "host-managed" or
+"host-aware"), this indicates the total number of zones of the device.
+This is always 0 for regular block devices.
+
+chunk_sectors (RO)
+------------------
+This has different meaning depending on the type of the block device.
+For a RAID device (dm-raid), chunk_sectors indicates the size in 512B sectors
+of the RAID volume stripe segment. For a zoned block device, either host-aware
+or host-managed, chunk_sectors indicates the size in 512B sectors of the zones
+of the device, with the eventual exception of the last zone of the device which
+may be smaller.
+
 Jens Axboe <jens.axboe@oracle.com>, February 2009

From cc56694f132a8f5fa9334e3afe990de8c3378866 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 17 Dec 2018 09:46:00 +0800
Subject: [PATCH 340/351] blk-mq-debugfs: support rq_qos

blk-mq-debugfs has been proved as very helpful for debug some
tough issues, such as IO hang.

We have seen blk-wbt related IO hang several times, even inside
Red Hat BZ, there is such report not sovled yet, so this patch
adds support debugfs on rq_qos.

Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 54 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h | 17 +++++++++++++
 block/blk-rq-qos.c     |  2 ++
 block/blk-rq-qos.h     | 24 +++++++++++++++++++
 include/linux/blkdev.h |  1 +
 5 files changed, 98 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index a32bb79d6c95..2793e91bc7a4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -23,6 +23,7 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-tag.h"
+#include "blk-rq-qos.h"
 
 static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
 {
@@ -856,6 +857,15 @@ int blk_mq_debugfs_register(struct request_queue *q)
 			goto err;
 	}
 
+	if (q->rq_qos) {
+		struct rq_qos *rqos = q->rq_qos;
+
+		while (rqos) {
+			blk_mq_debugfs_register_rqos(rqos);
+			rqos = rqos->next;
+		}
+	}
+
 	return 0;
 
 err:
@@ -978,6 +988,50 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
 	q->sched_debugfs_dir = NULL;
 }
 
+void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
+{
+	debugfs_remove_recursive(rqos->debugfs_dir);
+	rqos->debugfs_dir = NULL;
+}
+
+int blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+{
+	struct request_queue *q = rqos->q;
+	const char *dir_name = rq_qos_id_to_name(rqos->id);
+
+	if (!q->debugfs_dir)
+		return -ENOENT;
+
+	if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
+		return 0;
+
+	if (!q->rqos_debugfs_dir) {
+		q->rqos_debugfs_dir = debugfs_create_dir("rqos",
+							 q->debugfs_dir);
+		if (!q->rqos_debugfs_dir)
+			return -ENOMEM;
+	}
+
+	rqos->debugfs_dir = debugfs_create_dir(dir_name,
+					       rqos->q->rqos_debugfs_dir);
+	if (!rqos->debugfs_dir)
+		return -ENOMEM;
+
+	if (!debugfs_create_files(rqos->debugfs_dir, rqos,
+				  rqos->ops->debugfs_attrs))
+		goto err;
+	return 0;
+ err:
+	blk_mq_debugfs_unregister_rqos(rqos);
+	return -ENOMEM;
+}
+
+void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
+{
+	debugfs_remove_recursive(q->rqos_debugfs_dir);
+	q->rqos_debugfs_dir = NULL;
+}
+
 int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a9160be12be0..8c9012a578c1 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -31,6 +31,10 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q);
 int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 				       struct blk_mq_hw_ctx *hctx);
 void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
+
+int blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
+void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
+void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
 #else
 static inline int blk_mq_debugfs_register(struct request_queue *q)
 {
@@ -78,6 +82,19 @@ static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
 static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
 {
 }
+
+static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
+{
+	return 0;
+}
+
+static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
+{
+}
+
+static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
+{
+}
 #endif
 
 #ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index e932ef9d2718..d169d7188fa6 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -264,6 +264,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
 
 void rq_qos_exit(struct request_queue *q)
 {
+	blk_mq_debugfs_unregister_queue_rqos(q);
+
 	while (q->rq_qos) {
 		struct rq_qos *rqos = q->rq_qos;
 		q->rq_qos = rqos->next;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 8678875de420..3c85f26d3846 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -7,6 +7,10 @@
 #include <linux/atomic.h>
 #include <linux/wait.h>
 
+#include "blk-mq-debugfs.h"
+
+struct blk_mq_debugfs_attr;
+
 enum rq_qos_id {
 	RQ_QOS_WBT,
 	RQ_QOS_CGROUP,
@@ -22,6 +26,9 @@ struct rq_qos {
 	struct request_queue *q;
 	enum rq_qos_id id;
 	struct rq_qos *next;
+#ifdef CONFIG_BLK_DEBUG_FS
+	struct dentry *debugfs_dir;
+#endif
 };
 
 struct rq_qos_ops {
@@ -33,6 +40,7 @@ struct rq_qos_ops {
 	void (*done_bio)(struct rq_qos *, struct bio *);
 	void (*cleanup)(struct rq_qos *, struct bio *);
 	void (*exit)(struct rq_qos *);
+	const struct blk_mq_debugfs_attr *debugfs_attrs;
 };
 
 struct rq_depth {
@@ -66,6 +74,17 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
 	return rq_qos_id(q, RQ_QOS_CGROUP);
 }
 
+static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
+{
+	switch (id) {
+	case RQ_QOS_WBT:
+		return "wbt";
+	case RQ_QOS_CGROUP:
+		return "cgroup";
+	}
+	return "unknown";
+}
+
 static inline void rq_wait_init(struct rq_wait *rq_wait)
 {
 	atomic_set(&rq_wait->inflight, 0);
@@ -76,6 +95,9 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
 {
 	rqos->next = q->rq_qos;
 	q->rq_qos = rqos;
+
+	if (rqos->ops->debugfs_attrs)
+		blk_mq_debugfs_register_rqos(rqos);
 }
 
 static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
@@ -91,6 +113,8 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
 		}
 		prev = cur;
 	}
+
+	blk_mq_debugfs_unregister_rqos(rqos);
 }
 
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 81f1b105946b..45552e6eae1e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -560,6 +560,7 @@ struct request_queue {
 #ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry		*debugfs_dir;
 	struct dentry		*sched_debugfs_dir;
+	struct dentry		*rqos_debugfs_dir;
 #endif
 
 	bool			mq_sysfs_init_done;

From d19afebca47602bd9909944eb0ee406dce66174d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 17 Dec 2018 09:46:01 +0800
Subject: [PATCH 341/351] blk-wbt: export internal state via debugfs

This information is helpful to either investigate issues, or understand
wbt's internal behaviour.

Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: Omar Sandoval <osandov@fb.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-wbt.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 463e4eb80287..f0c56649775f 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -715,6 +715,94 @@ void wbt_disable_default(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(wbt_disable_default);
 
+#ifdef CONFIG_BLK_DEBUG_FS
+static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%llu\n", rwb->cur_win_nsec);
+	return 0;
+}
+
+static int wbt_enabled_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%d\n", rwb->enable_state);
+	return 0;
+}
+
+static int wbt_id_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+
+	seq_printf(m, "%u\n", rqos->id);
+	return 0;
+}
+
+static int wbt_inflight_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+	int i;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		seq_printf(m, "%d: inflight %d\n", i,
+			   atomic_read(&rwb->rq_wait[i].inflight));
+	return 0;
+}
+
+static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%lu\n", rwb->min_lat_nsec);
+	return 0;
+}
+
+static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%u\n", rwb->unknown_cnt);
+	return 0;
+}
+
+static int wbt_normal_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%u\n", rwb->wb_normal);
+	return 0;
+}
+
+static int wbt_background_show(void *data, struct seq_file *m)
+{
+	struct rq_qos *rqos = data;
+	struct rq_wb *rwb = RQWB(rqos);
+
+	seq_printf(m, "%u\n", rwb->wb_background);
+	return 0;
+}
+
+static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
+	{"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
+	{"enabled", 0400, wbt_enabled_show},
+	{"id", 0400, wbt_id_show},
+	{"inflight", 0400, wbt_inflight_show},
+	{"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
+	{"unknown_cnt", 0400, wbt_unknown_cnt_show},
+	{"wb_normal", 0400, wbt_normal_show},
+	{"wb_background", 0400, wbt_background_show},
+	{},
+};
+#endif
+
 static struct rq_qos_ops wbt_rqos_ops = {
 	.throttle = wbt_wait,
 	.issue = wbt_issue,
@@ -723,6 +811,9 @@ static struct rq_qos_ops wbt_rqos_ops = {
 	.done = wbt_done,
 	.cleanup = wbt_cleanup,
 	.exit = wbt_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+	.debugfs_attrs = wbt_debugfs_attrs,
+#endif
 };
 
 int wbt_init(struct request_queue *q)

From 07b35eb5a364fa59f88f65e6c786192f2c9163be Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 17 Dec 2018 18:42:45 +0800
Subject: [PATCH 342/351] blk-mq: fix allocation for queue mapping table

Type of each element in queue mapping table is 'unsigned int,
intead of 'struct blk_mq_queue_map)', so fix it.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2d3a29eb58ca..313f28b2d079 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3019,7 +3019,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	ret = -ENOMEM;
 	for (i = 0; i < set->nr_maps; i++) {
 		set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
-						  sizeof(struct blk_mq_queue_map),
+						  sizeof(set->map[i].mq_map[0]),
 						  GFP_KERNEL, set->numa_node);
 		if (!set->map[i].mq_map)
 			goto out_free_mq_map;

From 346fc1089e5d4734990b4b6c3cd0cdeae9ae482d Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 17 Dec 2018 18:42:48 +0800
Subject: [PATCH 343/351] blk-mq: export hctx->type in debugfs instead of sysfs

Now we only export hctx->type via sysfs, and there isn't such info
in hctx entry under debugfs. We often use debugfs only to diagnose
queue mapping issue, so add the support in debugfs.

Queue mapping becomes a bit more complicated after multiple queue
mapping is supported, we may write blktest to verify if queue mapping
is valid based on blk-mq-debugfs.

Given not necessary to export hctx->type twice, so remove the export
from sysfs.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 16 ++++++++++++++++
 block/blk-mq-sysfs.c   | 17 -----------------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 2793e91bc7a4..1e12033be9ea 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -447,6 +447,21 @@ static int hctx_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
+static const char *const hctx_types[] = {
+	[HCTX_TYPE_DEFAULT]	= "default",
+	[HCTX_TYPE_READ]	= "read",
+	[HCTX_TYPE_POLL]	= "poll",
+};
+
+static int hctx_type_show(void *data, struct seq_file *m)
+{
+	struct blk_mq_hw_ctx *hctx = data;
+
+	BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES);
+	seq_printf(m, "%s\n", hctx_types[hctx->type]);
+	return 0;
+}
+
 static int hctx_ctx_map_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_hw_ctx *hctx = data;
@@ -799,6 +814,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 	{"run", 0600, hctx_run_show, hctx_run_write},
 	{"active", 0400, hctx_active_show},
 	{"dispatch_busy", 0400, hctx_dispatch_busy_show},
+	{"type", 0400, hctx_type_show},
 	{},
 };
 
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 9c2df137256a..3f9c3f4ac44c 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -173,18 +173,6 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return ret;
 }
 
-static const char *const hctx_types[] = {
-	[HCTX_TYPE_DEFAULT]	= "default",
-	[HCTX_TYPE_READ]	= "read",
-	[HCTX_TYPE_POLL]	= "poll",
-};
-
-static ssize_t blk_mq_hw_sysfs_type_show(struct blk_mq_hw_ctx *hctx, char *page)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES);
-	return sprintf(page, "%s\n", hctx_types[hctx->type]);
-}
-
 static struct attribute *default_ctx_attrs[] = {
 	NULL,
 };
@@ -201,16 +189,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
 	.attr = {.name = "cpu_list", .mode = 0444 },
 	.show = blk_mq_hw_sysfs_cpus_show,
 };
-static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_type = {
-	.attr = {.name = "type", .mode = 0444 },
-	.show = blk_mq_hw_sysfs_type_show,
-};
 
 static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_nr_tags.attr,
 	&blk_mq_hw_sysfs_nr_reserved_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
-	&blk_mq_hw_sysfs_type.attr,
 	NULL,
 };
 

From 5aceaeb26394538858a9dbae5830d628469a44cf Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Dec 2018 12:16:26 +0100
Subject: [PATCH 344/351] blk-mq: only dispatch to non-defauly queue maps if
 they have queues

We should check if a given queue map actually has queues enabled before
dispatching to it.  This allows drivers to not initialize optional but
not used map types, which subsequently will allow fixing problems with
queue map rebuilds for that case.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index b63a0de8a07a..d1ed096723fb 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -105,14 +105,17 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
 {
 	enum hctx_type type = HCTX_TYPE_DEFAULT;
 
-	if (q->tag_set->nr_maps > HCTX_TYPE_POLL &&
-	    ((flags & REQ_HIPRI) && test_bit(QUEUE_FLAG_POLL, &q->queue_flags)))
+	if ((flags & REQ_HIPRI) &&
+	    q->tag_set->nr_maps > HCTX_TYPE_POLL && 
+	    q->tag_set->map[HCTX_TYPE_POLL].nr_queues &&
+	    test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
 		type = HCTX_TYPE_POLL;
 
-	else if (q->tag_set->nr_maps > HCTX_TYPE_READ &&
-		 ((flags & REQ_OP_MASK) == REQ_OP_READ))
+	else if (((flags & REQ_OP_MASK) == REQ_OP_READ) &&
+	         q->tag_set->nr_maps > HCTX_TYPE_READ &&
+		 q->tag_set->map[HCTX_TYPE_READ].nr_queues)
 		type = HCTX_TYPE_READ;
-
+	
 	return blk_mq_map_queue_type(q, type, cpu);
 }
 

From 7e849dd9cf37bc52aff9b5236377c405040c959c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 17 Dec 2018 12:16:27 +0100
Subject: [PATCH 345/351] nvme-pci: don't share queue maps

Now that the block layer checks if a queue map has any queues inside
it there is no more reason to duplicate the maps for the non-default
types.

Reviewed-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/pci.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fb9d8270f32c..698b350b38cf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -496,11 +496,7 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 		map->nr_queues = dev->io_queues[i];
 		if (!map->nr_queues) {
 			BUG_ON(i == HCTX_TYPE_DEFAULT);
-
-			/* shared set, resuse read set parameters */
-			map->nr_queues = dev->io_queues[HCTX_TYPE_DEFAULT];
-			qoff = 0;
-			offset = queue_irq_offset(dev);
+			continue;
 		}
 
 		/*

From 7211aef86f79583e59b88a0aba0bc830566f7e8e Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@wdc.com>
Date: Mon, 17 Dec 2018 15:14:05 +0900
Subject: [PATCH 346/351] block: mq-deadline: Fix write completion handling

For a zoned block device using mq-deadline, if a write request for a
zone is received while another write was already dispatched for the same
zone, dd_dispatch_request() will return NULL and the newly inserted
write request is kept in the scheduler queue waiting for the ongoing
zone write to complete. With this behavior, when no other request has
been dispatched, rq_list in blk_mq_sched_dispatch_requests() is empty
and blk_mq_sched_mark_restart_hctx() not called. This in turn leads to
__blk_mq_free_request() call of blk_mq_sched_restart() to not run the
queue when the already dispatched write request completes. The newly
dispatched request stays stuck in the scheduler queue until eventually
another request is submitted.

This problem does not affect SCSI disk as the SCSI stack handles queue
restart on request completion. However, this problem is can be triggered
the nullblk driver with zoned mode enabled.

Fix this by always requesting a queue restart in dd_dispatch_request()
if no request was dispatched while WRITE requests are queued.

Fixes: 5700f69178e9 ("mq-deadline: Introduce zone locking support")
Cc: <stable@vger.kernel.org>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>

Add missing export of blk_mq_sched_restart()

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-sched.c |  3 ++-
 block/blk-mq-sched.h |  1 +
 block/mq-deadline.c  | 12 +++++++++++-
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 5b4d52d9cba2..056fa9baf44e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -61,13 +61,14 @@ void blk_mq_sched_assign_ioc(struct request *rq)
  * Mark a hardware queue as needing a restart. For shared queues, maintain
  * a count of how many hardware queues are marked for restart.
  */
-static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
+void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
 	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
 		return;
 
 	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
+EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
 
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 0f719c8532ae..c7bdb52367ac 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -15,6 +15,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
 				struct request **merged_request);
 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
 void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_sched_insert_request(struct request *rq, bool at_head,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1bd06cefce57..14288f864e94 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -373,9 +373,16 @@ done:
 
 /*
  * One confusing aspect here is that we get called for a specific
- * hardware queue, but we return a request that may not be for a
+ * hardware queue, but we may return a request that is for a
  * different hardware queue. This is because mq-deadline has shared
  * state for all hardware queues, in terms of sorting, FIFOs, etc.
+ *
+ * For a zoned block device, __dd_dispatch_request() may return NULL
+ * if all the queued write requests are directed at zones that are already
+ * locked due to on-going write requests. In this case, make sure to mark
+ * the queue as needing a restart to ensure that the queue is run again
+ * and the pending writes dispatched once the target zones for the ongoing
+ * write requests are unlocked in dd_finish_request().
  */
 static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 {
@@ -384,6 +391,9 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
 
 	spin_lock(&dd->lock);
 	rq = __dd_dispatch_request(dd);
+	if (!rq && blk_queue_is_zoned(hctx->queue) &&
+	    !list_empty(&dd->fifo_list[WRITE]))
+		blk_mq_sched_mark_restart_hctx(hctx);
 	spin_unlock(&dd->lock);
 
 	return rq;

From c16d6b5a9f47d0e581882269fca1d73be60208b2 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 17 Dec 2018 08:44:05 -0700
Subject: [PATCH 347/351] blk-mq: fix dispatch from sw queue

When a request is added to rq list of sw queue(ctx), the rq may be from
a different type of hctx, especially after multi queue mapping is
introduced.

So when dispach request from sw queue via blk_mq_flush_busy_ctxs() or
blk_mq_dequeue_from_ctx(), one request belonging to other queue type of
hctx can be dispatched to current hctx in case that read queue or poll
queue is enabled.

This patch fixes this issue by introducing per-queue-type list.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>

Changed by me to not use separately cacheline aligned lists, just
place them all in the same cacheline where we had just the one list
and lock before.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq-debugfs.c | 65 ++++++++++++++++++++++++------------------
 block/blk-mq-sched.c   | 11 +++++--
 block/blk-mq.c         | 29 ++++++++++++-------
 block/blk-mq.h         |  4 +--
 4 files changed, 66 insertions(+), 43 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 1e12033be9ea..90d68760af08 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -652,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
 	return 0;
 }
 
-static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos)
-	__acquires(&ctx->lock)
-{
-	struct blk_mq_ctx *ctx = m->private;
-
-	spin_lock(&ctx->lock);
-	return seq_list_start(&ctx->rq_list, *pos);
+#define CTX_RQ_SEQ_OPS(name, type)					\
+static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
+	__acquires(&ctx->lock)						\
+{									\
+	struct blk_mq_ctx *ctx = m->private;				\
+									\
+	spin_lock(&ctx->lock);						\
+	return seq_list_start(&ctx->rq_lists[type], *pos);		\
+}									\
+									\
+static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v,	\
+				     loff_t *pos)			\
+{									\
+	struct blk_mq_ctx *ctx = m->private;				\
+									\
+	return seq_list_next(v, &ctx->rq_lists[type], pos);		\
+}									\
+									\
+static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v)	\
+	__releases(&ctx->lock)						\
+{									\
+	struct blk_mq_ctx *ctx = m->private;				\
+									\
+	spin_unlock(&ctx->lock);					\
+}									\
+									\
+static const struct seq_operations ctx_##name##_rq_list_seq_ops = {	\
+	.start	= ctx_##name##_rq_list_start,				\
+	.next	= ctx_##name##_rq_list_next,				\
+	.stop	= ctx_##name##_rq_list_stop,				\
+	.show	= blk_mq_debugfs_rq_show,				\
 }
 
-static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	struct blk_mq_ctx *ctx = m->private;
+CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
+CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
+CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
 
-	return seq_list_next(v, &ctx->rq_list, pos);
-}
-
-static void ctx_rq_list_stop(struct seq_file *m, void *v)
-	__releases(&ctx->lock)
-{
-	struct blk_mq_ctx *ctx = m->private;
-
-	spin_unlock(&ctx->lock);
-}
-
-static const struct seq_operations ctx_rq_list_seq_ops = {
-	.start	= ctx_rq_list_start,
-	.next	= ctx_rq_list_next,
-	.stop	= ctx_rq_list_stop,
-	.show	= blk_mq_debugfs_rq_show,
-};
 static int ctx_dispatched_show(void *data, struct seq_file *m)
 {
 	struct blk_mq_ctx *ctx = data;
@@ -819,7 +826,9 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
 };
 
 static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
-	{"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops},
+	{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
+	{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
+	{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
 	{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
 	{"merged", 0600, ctx_merged_show, ctx_merged_write},
 	{"completed", 0600, ctx_completed_show, ctx_completed_write},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 056fa9baf44e..140933e4a7d1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -302,11 +302,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
  * too much time checking for merges.
  */
 static bool blk_mq_attempt_merge(struct request_queue *q,
+				 struct blk_mq_hw_ctx *hctx,
 				 struct blk_mq_ctx *ctx, struct bio *bio)
 {
+	enum hctx_type type = hctx->type;
+
 	lockdep_assert_held(&ctx->lock);
 
-	if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
+	if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
 		ctx->rq_merged++;
 		return true;
 	}
@@ -320,17 +323,19 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu);
 	bool ret = false;
+	enum hctx_type type;
 
 	if (e && e->type->ops.bio_merge) {
 		blk_mq_put_ctx(ctx);
 		return e->type->ops.bio_merge(hctx, bio);
 	}
 
+	type = hctx->type;
 	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
-			!list_empty_careful(&ctx->rq_list)) {
+			!list_empty_careful(&ctx->rq_lists[type])) {
 		/* default per sw-queue merge */
 		spin_lock(&ctx->lock);
-		ret = blk_mq_attempt_merge(q, ctx, bio);
+		ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
 		spin_unlock(&ctx->lock);
 	}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 313f28b2d079..9c1c1544bac3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -958,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
 	struct flush_busy_ctx_data *flush_data = data;
 	struct blk_mq_hw_ctx *hctx = flush_data->hctx;
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+	enum hctx_type type = hctx->type;
 
 	spin_lock(&ctx->lock);
-	list_splice_tail_init(&ctx->rq_list, flush_data->list);
+	list_splice_tail_init(&ctx->rq_lists[type], flush_data->list);
 	sbitmap_clear_bit(sb, bitnr);
 	spin_unlock(&ctx->lock);
 	return true;
@@ -992,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
 	struct dispatch_rq_data *dispatch_data = data;
 	struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
 	struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
+	enum hctx_type type = hctx->type;
 
 	spin_lock(&ctx->lock);
-	if (!list_empty(&ctx->rq_list)) {
-		dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
+	if (!list_empty(&ctx->rq_lists[type])) {
+		dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
 		list_del_init(&dispatch_data->rq->queuelist);
-		if (list_empty(&ctx->rq_list))
+		if (list_empty(&ctx->rq_lists[type]))
 			sbitmap_clear_bit(sb, bitnr);
 	}
 	spin_unlock(&ctx->lock);
@@ -1608,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 					    bool at_head)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	enum hctx_type type = hctx->type;
 
 	lockdep_assert_held(&ctx->lock);
 
 	trace_block_rq_insert(hctx->queue, rq);
 
 	if (at_head)
-		list_add(&rq->queuelist, &ctx->rq_list);
+		list_add(&rq->queuelist, &ctx->rq_lists[type]);
 	else
-		list_add_tail(&rq->queuelist, &ctx->rq_list);
+		list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
 }
 
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
@@ -1651,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 {
 	struct request *rq;
+	enum hctx_type type = hctx->type;
 
 	/*
 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
@@ -1662,7 +1666,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	}
 
 	spin_lock(&ctx->lock);
-	list_splice_tail_init(list, &ctx->rq_list);
+	list_splice_tail_init(list, &ctx->rq_lists[type]);
 	blk_mq_hctx_mark_pending(hctx, ctx);
 	spin_unlock(&ctx->lock);
 }
@@ -2200,13 +2204,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	LIST_HEAD(tmp);
+	enum hctx_type type;
 
 	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
+	type = hctx->type;
 
 	spin_lock(&ctx->lock);
-	if (!list_empty(&ctx->rq_list)) {
-		list_splice_init(&ctx->rq_list, &tmp);
+	if (!list_empty(&ctx->rq_lists[type])) {
+		list_splice_init(&ctx->rq_lists[type], &tmp);
 		blk_mq_hctx_clear_pending(hctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
@@ -2343,10 +2349,13 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	for_each_possible_cpu(i) {
 		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
 		struct blk_mq_hw_ctx *hctx;
+		int k;
 
 		__ctx->cpu = i;
 		spin_lock_init(&__ctx->lock);
-		INIT_LIST_HEAD(&__ctx->rq_list);
+		for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++)
+			INIT_LIST_HEAD(&__ctx->rq_lists[k]);
+
 		__ctx->queue = q;
 
 		/*
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d1ed096723fb..d943d46b0785 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -18,8 +18,8 @@ struct blk_mq_ctxs {
 struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
-		struct list_head	rq_list;
-	}  ____cacheline_aligned_in_smp;
+		struct list_head	rq_lists[HCTX_MAX_TYPES];
+	} ____cacheline_aligned_in_smp;
 
 	unsigned int		cpu;
 	unsigned short		index_hw[HCTX_MAX_TYPES];

From 13369816cb648f897ce9cbf57e55eeb742ce4eb3 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennis@kernel.org>
Date: Mon, 17 Dec 2018 11:03:51 -0500
Subject: [PATCH 348/351] block: fix blk-iolatency accounting underflow

The blk-iolatency controller measures the time from rq_qos_throttle() to
rq_qos_done_bio() and attributes this time to the first bio that needs
to create the request. This means if a bio is plug-mergeable or
bio-mergeable, it gets to bypass the blk-iolatency controller.

The recent series [1], to tag all bios w/ blkgs undermined how iolatency
was determining which bios it was charging and should process in
rq_qos_done_bio(). Because all bios are being tagged, this caused the
atomic_t for the struct rq_wait inflight count to underflow and result
in a stall.

This patch adds a new flag BIO_TRACKED to let controllers know that a
bio is going through the rq_qos path. blk-iolatency now checks if this
flag is set to see if it should process the bio in rq_qos_done_bio().

Overloading BLK_QUEUE_ENTERED works, but makes the flag rules confusing.
BIO_THROTTLED was another candidate, but the flag is set for all bios
that have gone through blk-throttle code. Overloading a flag comes with
the burden of making sure that when either implementation changes, a
change in setting rules for one doesn't cause a bug in the other. So
here, we unfortunately opt for adding a new flag.

[1] https://lore.kernel.org/lkml/20181205171039.73066-1-dennis@kernel.org/

Fixes: 5cdf2e3fea5e ("blkcg: associate blkg when associating a device")
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Cc: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-iolatency.c     | 2 +-
 block/blk-rq-qos.h        | 5 +++++
 include/linux/blk_types.h | 1 +
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index bee092727cad..fc714ef402a6 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -593,7 +593,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 	bool enabled = false;
 
 	blkg = bio->bi_blkg;
-	if (!blkg)
+	if (!blkg || !bio_flagged(bio, BIO_TRACKED))
 		return;
 
 	iolat = blkg_to_lat(bio->bi_blkg);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3c85f26d3846..564851889550 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -168,6 +168,11 @@ static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
 
 static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
 {
+	/*
+	 * BIO_TRACKED lets controllers know that a bio went through the
+	 * normal rq_qos path.
+	 */
+	bio_set_flag(bio, BIO_TRACKED);
 	if (q->rq_qos)
 		__rq_qos_throttle(q->rq_qos, bio);
 }
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 46c005d601ac..fc99474ac968 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -228,6 +228,7 @@ struct bio {
 #define BIO_TRACE_COMPLETION 10	/* bio_endio() should trace the final completion
 				 * of this bio. */
 #define BIO_QUEUE_ENTERED 11	/* can use blk_queue_enter_live() */
+#define BIO_TRACKED 12		/* set if bio goes through the rq_qos path */
 
 /* See BVEC_POOL_OFFSET below before adding new flags */
 

From e5edd5f298fafda28284bafb8371e6f0b7681035 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 18 Dec 2018 01:28:56 +0800
Subject: [PATCH 349/351] blk-mq: skip zero-queue maps in blk_mq_map_swqueue

From 7e849dd9cf37 ("nvme-pci: don't share queue maps"), the mapping
table won't be initialized actually if map->nr_queues is zero, so
we can't use blk_mq_map_queue_type() to retrieve hctx any more.

This way still may cause broken mapping, fix it by skipping zero-queues
maps in blk_mq_map_swqueue().

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9c1c1544bac3..6847f014606b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2438,6 +2438,9 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
 		for (j = 0; j < set->nr_maps; j++) {
+			if (!set->map[j].nr_queues)
+				continue;
+
 			hctx = blk_mq_map_queue_type(q, j, i);
 
 			/*

From 3c94d83cb352627f221d971b05f163c17527de74 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 17 Dec 2018 21:11:17 -0700
Subject: [PATCH 350/351] blk-mq: change blk_mq_queue_busy() to
 blk_mq_queue_inflight()

There's a single user of this function, dm, and dm just wants
to check if IO is inflight, not that it's just allocated.

This fixes a hang with srp/002 in blktests with dm, where it tries
to suspend but waits for inflight IO to finish first. As it checks
for just allocated requests, this fails.

Tested-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c         | 16 ++++++++--------
 drivers/md/dm.c        |  2 +-
 include/linux/blk-mq.h |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6847f014606b..b0888a89fa66 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -805,14 +805,14 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-static bool blk_mq_check_busy(struct blk_mq_hw_ctx *hctx, struct request *rq,
-			      void *priv, bool reserved)
+static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			       void *priv, bool reserved)
 {
 	/*
-	 * If we find a request, we know the queue is busy. Return false
-	 * to stop the iteration.
+	 * If we find a request that is inflight and the queue matches,
+	 * we know the queue is busy. Return false to stop the iteration.
 	 */
-	if (rq->q == hctx->queue) {
+	if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
 		bool *busy = priv;
 
 		*busy = true;
@@ -822,14 +822,14 @@ static bool blk_mq_check_busy(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	return true;
 }
 
-bool blk_mq_queue_busy(struct request_queue *q)
+bool blk_mq_queue_inflight(struct request_queue *q)
 {
 	bool busy = false;
 
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_busy, &busy);
+	blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy);
 	return busy;
 }
-EXPORT_SYMBOL_GPL(blk_mq_queue_busy);
+EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
 
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index c414d40d645d..dddbca63e140 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -663,7 +663,7 @@ static bool md_in_flight_bios(struct mapped_device *md)
 static bool md_in_flight(struct mapped_device *md)
 {
 	if (queue_is_mq(md->queue))
-		return blk_mq_queue_busy(md->queue);
+		return blk_mq_queue_inflight(md->queue);
 	else
 		return md_in_flight_bios(md);
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 57eda7b20243..d3c0a0d2680b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -257,7 +257,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
-bool blk_mq_queue_busy(struct request_queue *q);
+bool blk_mq_queue_inflight(struct request_queue *q);
 
 enum {
 	/* return when out of requests */

From cd19181bf9ad4b7f40f2a4e0355d052109c76529 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Tue, 18 Dec 2018 12:15:29 +0800
Subject: [PATCH 351/351] blk-mq: enable IO poll if .nr_queues of type poll > 0

The queue mapping of type poll only exists when set->map[HCTX_TYPE_POLL].nr_queues
is bigger than zero, so enhance the constraint by checking .nr_queues of type poll
before enabling IO poll.

Otherwise IO race & timeout can be observed when running block/007.

Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-mq.c    | 3 ++-
 block/blk-sysfs.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index b0888a89fa66..2de972857496 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2833,7 +2833,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	q->tag_set = set;
 
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
-	if (set->nr_maps > HCTX_TYPE_POLL)
+	if (set->nr_maps > HCTX_TYPE_POLL &&
+	    set->map[HCTX_TYPE_POLL].nr_queues)
 		blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
 	if (!(set->flags & BLK_MQ_F_SG_MERGE))
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index bb7c642ffefa..0619c8922893 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -402,7 +402,8 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	unsigned long poll_on;
 	ssize_t ret;
 
-	if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL)
+	if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
+	    !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
 		return -EINVAL;
 
 	ret = queue_var_store(&poll_on, page, count);