From 196cfe2ae8fcdc03b3c7d627e7dfe8c0ce7229f9 Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Thu, 14 Jul 2011 15:30:22 +0200
Subject: [PATCH 01/23] xen-blkfront: Drop name and minor adjustments for
 emulated scsi devices

These were intended to avoid the namespace clash when representing
emulated IDE and SCSI devices. However that seems to confuse users
more than expected (a disk defined as sda becomes xvde).
So for now go back to the scheme which does no adjustments. This
will break when mixing IDE and SCSI names in the configuration of
guests but should be by now expected.

Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b536a9cef917..238b9419c6d3 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -123,8 +123,8 @@ static DEFINE_SPINLOCK(minor_lock);
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
 #define EMULATED_HD_DISK_MINOR_OFFSET (0)
 #define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
-#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
-#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 

From 89153b5cae9f40c224a5d321665a97bf14220c2c Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Thu, 14 Jul 2011 15:30:37 +0200
Subject: [PATCH 02/23] xen-blkfront: Fix one off warning about name clash

Avoid telling users to use xvde and onwards when using xvde.

Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/xen-blkfront.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 238b9419c6d3..9ea8c2576c70 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -529,7 +529,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 		minor = BLKIF_MINOR_EXT(info->vdevice);
 		nr_parts = PARTS_PER_EXT_DISK;
 		offset = minor / nr_parts;
-		if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+		if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
 			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
 					"emulated IDE disks,\n\t choose an xvd device name"
 					"from xvde on\n", info->vdevice);

From aa387cc895672b00f807ad7c734a2defaf677712 Mon Sep 17 00:00:00 2001
From: Mike Christie <michaelc@cs.wisc.edu>
Date: Sun, 31 Jul 2011 22:05:09 +0200
Subject: [PATCH 03/23] block: add bsg helper library

This moves the FC classes bsg code to the block layer and
makes it a lib so that other classes like iscsi and SAS can use it.

It is helpful because working with the request queue, bios,
creating scatterlists, etc are a pain that the LLD does not
have to worry about with normal IOs and should not have to
worry about for bsg requests.

Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/Kconfig           |  10 ++
 block/Makefile          |   1 +
 block/bsg-lib.c         | 297 ++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h  |   4 +
 include/linux/bsg-lib.h |  73 ++++++++++
 5 files changed, 385 insertions(+)
 create mode 100644 block/bsg-lib.c
 create mode 100644 include/linux/bsg-lib.h

diff --git a/block/Kconfig b/block/Kconfig
index 60be1e0455da..e97934eececa 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -65,6 +65,16 @@ config BLK_DEV_BSG
 
 	  If unsure, say Y.
 
+config BLK_DEV_BSGLIB
+	bool "Block layer SG support v4 helper lib"
+	default n
+	select BLK_DEV_BSG
+	help
+	  Subsystems will normally enable this if needed. Users will not
+	  normally need to manually enable this.
+
+	  If unsure, say N.
+
 config BLK_DEV_INTEGRITY
 	bool "Block layer data integrity support"
 	---help---
diff --git a/block/Makefile b/block/Makefile
index 0fec4b3fab51..514c6e4f427a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)	+= bsg.o
+obj-$(CONFIG_BLK_DEV_BSGLIB)	+= bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)	+= blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
new file mode 100644
index 000000000000..f8c0a61a529c
--- /dev/null
+++ b/block/bsg-lib.c
@@ -0,0 +1,297 @@
+/*
+ *  BSG helper library
+ *
+ *  Copyright (C) 2008   James Smart, Emulex Corporation
+ *  Copyright (C) 2011   Red Hat, Inc.  All rights reserved.
+ *  Copyright (C) 2011   Mike Christie
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+#include <linux/bsg-lib.h>
+#include <scsi/scsi_cmnd.h>
+
+/**
+ * bsg_destroy_job - routine to teardown/delete a bsg job
+ * @job: bsg_job that is to be torn down
+ */
+static void bsg_destroy_job(struct bsg_job *job)
+{
+	put_device(job->dev);	/* release reference for the request */
+
+	kfree(job->request_payload.sg_list);
+	kfree(job->reply_payload.sg_list);
+	kfree(job);
+}
+
+/**
+ * bsg_job_done - completion routine for bsg requests
+ * @job: bsg_job that is complete
+ * @result: job reply result
+ * @reply_payload_rcv_len: length of payload recvd
+ *
+ * The LLD should call this when the bsg job has completed.
+ */
+void bsg_job_done(struct bsg_job *job, int result,
+		  unsigned int reply_payload_rcv_len)
+{
+	struct request *req = job->req;
+	struct request *rsp = req->next_rq;
+	int err;
+
+	err = job->req->errors = result;
+	if (err < 0)
+		/* we're only returning the result field in the reply */
+		job->req->sense_len = sizeof(u32);
+	else
+		job->req->sense_len = job->reply_len;
+	/* we assume all request payload was transferred, residual == 0 */
+	req->resid_len = 0;
+
+	if (rsp) {
+		WARN_ON(reply_payload_rcv_len > rsp->resid_len);
+
+		/* set reply (bidi) residual */
+		rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
+	}
+	blk_complete_request(req);
+}
+EXPORT_SYMBOL_GPL(bsg_job_done);
+
+/**
+ * bsg_softirq_done - softirq done routine for destroying the bsg requests
+ * @rq: BSG request that holds the job to be destroyed
+ */
+static void bsg_softirq_done(struct request *rq)
+{
+	struct bsg_job *job = rq->special;
+
+	blk_end_request_all(rq, rq->errors);
+	bsg_destroy_job(job);
+}
+
+static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
+{
+	size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments);
+
+	BUG_ON(!req->nr_phys_segments);
+
+	buf->sg_list = kzalloc(sz, GFP_KERNEL);
+	if (!buf->sg_list)
+		return -ENOMEM;
+	sg_init_table(buf->sg_list, req->nr_phys_segments);
+	buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
+	buf->payload_len = blk_rq_bytes(req);
+	return 0;
+}
+
+/**
+ * bsg_create_job - create the bsg_job structure for the bsg request
+ * @dev: device that is being sent the bsg request
+ * @req: BSG request that needs a job structure
+ */
+static int bsg_create_job(struct device *dev, struct request *req)
+{
+	struct request *rsp = req->next_rq;
+	struct request_queue *q = req->q;
+	struct bsg_job *job;
+	int ret;
+
+	BUG_ON(req->special);
+
+	job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL);
+	if (!job)
+		return -ENOMEM;
+
+	req->special = job;
+	job->req = req;
+	if (q->bsg_job_size)
+		job->dd_data = (void *)&job[1];
+	job->request = req->cmd;
+	job->request_len = req->cmd_len;
+	job->reply = req->sense;
+	job->reply_len = SCSI_SENSE_BUFFERSIZE;	/* Size of sense buffer
+						 * allocated */
+	if (req->bio) {
+		ret = bsg_map_buffer(&job->request_payload, req);
+		if (ret)
+			goto failjob_rls_job;
+	}
+	if (rsp && rsp->bio) {
+		ret = bsg_map_buffer(&job->reply_payload, rsp);
+		if (ret)
+			goto failjob_rls_rqst_payload;
+	}
+	job->dev = dev;
+	/* take a reference for the request */
+	get_device(job->dev);
+	return 0;
+
+failjob_rls_rqst_payload:
+	kfree(job->request_payload.sg_list);
+failjob_rls_job:
+	kfree(job);
+	return -ENOMEM;
+}
+
+/*
+ * bsg_goose_queue - restart queue in case it was stopped
+ * @q: request q to be restarted
+ */
+void bsg_goose_queue(struct request_queue *q)
+{
+	if (!q)
+		return;
+
+	blk_run_queue_async(q);
+}
+EXPORT_SYMBOL_GPL(bsg_goose_queue);
+
+/**
+ * bsg_request_fn - generic handler for bsg requests
+ * @q: request queue to manage
+ *
+ * On error the create_bsg_job function should return a -Exyz error value
+ * that will be set to the req->errors.
+ *
+ * Drivers/subsys should pass this to the queue init function.
+ */
+void bsg_request_fn(struct request_queue *q)
+{
+	struct device *dev = q->queuedata;
+	struct request *req;
+	struct bsg_job *job;
+	int ret;
+
+	if (!get_device(dev))
+		return;
+
+	while (1) {
+		req = blk_fetch_request(q);
+		if (!req)
+			break;
+		spin_unlock_irq(q->queue_lock);
+
+		ret = bsg_create_job(dev, req);
+		if (ret) {
+			req->errors = ret;
+			blk_end_request_all(req, ret);
+			spin_lock_irq(q->queue_lock);
+			continue;
+		}
+
+		job = req->special;
+		ret = q->bsg_job_fn(job);
+		spin_lock_irq(q->queue_lock);
+		if (ret)
+			break;
+	}
+
+	spin_unlock_irq(q->queue_lock);
+	put_device(dev);
+	spin_lock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(bsg_request_fn);
+
+/**
+ * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
+ * @dev: device to attach bsg device to
+ * @q: request queue setup by caller
+ * @name: device to give bsg device
+ * @job_fn: bsg job handler
+ * @dd_job_size: size of LLD data needed for each job
+ *
+ * The caller should have setup the reuqest queue with bsg_request_fn
+ * as the request_fn.
+ */
+int bsg_setup_queue(struct device *dev, struct request_queue *q,
+		    char *name, bsg_job_fn *job_fn, int dd_job_size)
+{
+	int ret;
+
+	q->queuedata = dev;
+	q->bsg_job_size = dd_job_size;
+	q->bsg_job_fn = job_fn;
+	queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
+	blk_queue_softirq_done(q, bsg_softirq_done);
+	blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
+
+	ret = bsg_register_queue(q, dev, name, NULL);
+	if (ret) {
+		printk(KERN_ERR "%s: bsg interface failed to "
+		       "initialize - register queue\n", dev->kobj.name);
+		return ret;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bsg_setup_queue);
+
+/**
+ * bsg_remove_queue - Deletes the bsg dev from the q
+ * @q:	the request_queue that is to be torn down.
+ *
+ * Notes:
+ *   Before unregistering the queue empty any requests that are blocked
+ */
+void bsg_remove_queue(struct request_queue *q)
+{
+	struct request *req; /* block request */
+	int counts; /* totals for request_list count and starved */
+
+	if (!q)
+		return;
+
+	/* Stop taking in new requests */
+	spin_lock_irq(q->queue_lock);
+	blk_stop_queue(q);
+
+	/* drain all requests in the queue */
+	while (1) {
+		/* need the lock to fetch a request
+		 * this may fetch the same reqeust as the previous pass
+		 */
+		req = blk_fetch_request(q);
+		/* save requests in use and starved */
+		counts = q->rq.count[0] + q->rq.count[1] +
+			 q->rq.starved[0] + q->rq.starved[1];
+		spin_unlock_irq(q->queue_lock);
+		/* any requests still outstanding? */
+		if (counts == 0)
+			break;
+
+		/* This may be the same req as the previous iteration,
+		 * always send the blk_end_request_all after a prefetch.
+		 * It is not okay to not end the request because the
+		 * prefetch started the request.
+		 */
+		if (req) {
+			/* return -ENXIO to indicate that this queue is
+			 * going away
+			 */
+			req->errors = -ENXIO;
+			blk_end_request_all(req, -ENXIO);
+		}
+
+		msleep(200); /* allow bsg to possibly finish */
+		spin_lock_irq(q->queue_lock);
+	}
+	bsg_unregister_queue(q);
+}
+EXPORT_SYMBOL_GPL(bsg_remove_queue);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0e67c45b3bc9..847928546076 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -30,6 +30,7 @@ struct request_pm_state;
 struct blk_trace;
 struct request;
 struct sg_io_hdr;
+struct bsg_job;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -209,6 +210,7 @@ typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
+typedef int (bsg_job_fn) (struct bsg_job *);
 
 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
@@ -375,6 +377,8 @@ struct request_queue {
 	struct mutex		sysfs_lock;
 
 #if defined(CONFIG_BLK_DEV_BSG)
+	bsg_job_fn		*bsg_job_fn;
+	int			bsg_job_size;
 	struct bsg_class_device bsg_dev;
 #endif
 
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
new file mode 100644
index 000000000000..f55ab8cdc106
--- /dev/null
+++ b/include/linux/bsg-lib.h
@@ -0,0 +1,73 @@
+/*
+ *  BSG helper library
+ *
+ *  Copyright (C) 2008   James Smart, Emulex Corporation
+ *  Copyright (C) 2011   Red Hat, Inc.  All rights reserved.
+ *  Copyright (C) 2011   Mike Christie
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#ifndef _BLK_BSG_
+#define _BLK_BSG_
+
+#include <linux/blkdev.h>
+
+struct request;
+struct device;
+struct scatterlist;
+struct request_queue;
+
+struct bsg_buffer {
+	unsigned int payload_len;
+	int sg_cnt;
+	struct scatterlist *sg_list;
+};
+
+struct bsg_job {
+	struct device *dev;
+	struct request *req;
+
+	/* Transport/driver specific request/reply structs */
+	void *request;
+	void *reply;
+
+	unsigned int request_len;
+	unsigned int reply_len;
+	/*
+	 * On entry : reply_len indicates the buffer size allocated for
+	 * the reply.
+	 *
+	 * Upon completion : the message handler must set reply_len
+	 *  to indicates the size of the reply to be returned to the
+	 *  caller.
+	 */
+
+	/* DMA payloads for the request/response */
+	struct bsg_buffer request_payload;
+	struct bsg_buffer reply_payload;
+
+	void *dd_data;		/* Used for driver-specific storage */
+};
+
+void bsg_job_done(struct bsg_job *job, int result,
+		  unsigned int reply_payload_rcv_len);
+int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
+		    bsg_job_fn *job_fn, int dd_job_size);
+void bsg_request_fn(struct request_queue *q);
+void bsg_remove_queue(struct request_queue *q);
+void bsg_goose_queue(struct request_queue *q);
+
+#endif

From 34dd82afd27da2537199d7f71f1542501c6f96e7 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 31 Jul 2011 22:08:04 +0200
Subject: [PATCH 04/23] loop: replace linked list of allocated devices with an
 idr index

Replace the linked list, that keeps track of allocated devices, with an
idr index to allow a more efficient lookup of devices.

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 152 +++++++++++++++++++++++--------------------
 include/linux/loop.h |   1 -
 2 files changed, 80 insertions(+), 73 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 76c8da78212b..f58532e77777 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -78,8 +78,8 @@
 
 #include <asm/uaccess.h>
 
-static LIST_HEAD(loop_devices);
-static DEFINE_MUTEX(loop_devices_mutex);
+static DEFINE_IDR(loop_index_idr);
+static DEFINE_MUTEX(loop_index_mutex);
 
 static int max_part;
 static int part_shift;
@@ -722,17 +722,10 @@ static inline int is_loop_device(struct file *file)
 static ssize_t loop_attr_show(struct device *dev, char *page,
 			      ssize_t (*callback)(struct loop_device *, char *))
 {
-	struct loop_device *l, *lo = NULL;
+	struct gendisk *disk = dev_to_disk(dev);
+	struct loop_device *lo = disk->private_data;
 
-	mutex_lock(&loop_devices_mutex);
-	list_for_each_entry(l, &loop_devices, lo_list)
-		if (disk_to_dev(l->lo_disk) == dev) {
-			lo = l;
-			break;
-		}
-	mutex_unlock(&loop_devices_mutex);
-
-	return lo ? callback(lo, page) : -EIO;
+	return callback(lo, page);
 }
 
 #define LOOP_ATTR_RO(_name)						\
@@ -1557,40 +1550,64 @@ int loop_register_transfer(struct loop_func_table *funcs)
 	return 0;
 }
 
+static int unregister_transfer_cb(int id, void *ptr, void *data)
+{
+	struct loop_device *lo = ptr;
+	struct loop_func_table *xfer = data;
+
+	mutex_lock(&lo->lo_ctl_mutex);
+	if (lo->lo_encryption == xfer)
+		loop_release_xfer(lo);
+	mutex_unlock(&lo->lo_ctl_mutex);
+	return 0;
+}
+
 int loop_unregister_transfer(int number)
 {
 	unsigned int n = number;
-	struct loop_device *lo;
 	struct loop_func_table *xfer;
 
 	if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
 		return -EINVAL;
 
 	xfer_funcs[n] = NULL;
-
-	list_for_each_entry(lo, &loop_devices, lo_list) {
-		mutex_lock(&lo->lo_ctl_mutex);
-
-		if (lo->lo_encryption == xfer)
-			loop_release_xfer(lo);
-
-		mutex_unlock(&lo->lo_ctl_mutex);
-	}
-
+	idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
 	return 0;
 }
 
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
-static struct loop_device *loop_alloc(int i)
+static int loop_add(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
 	struct gendisk *disk;
+	int err;
 
 	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
-	if (!lo)
+	if (!lo) {
+		err = -ENOMEM;
 		goto out;
+	}
+
+	err = idr_pre_get(&loop_index_idr, GFP_KERNEL);
+	if (err < 0)
+		goto out_free_dev;
+
+	if (i >= 0) {
+		int m;
+
+		/* create specific i in the index */
+		err = idr_get_new_above(&loop_index_idr, lo, i, &m);
+		if (err >= 0 && i != m) {
+			idr_remove(&loop_index_idr, m);
+			err = -EEXIST;
+		}
+	} else {
+		err = -EINVAL;
+	}
+	if (err < 0)
+		goto out_free_dev;
 
 	lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
 	if (!lo->lo_queue)
@@ -1611,56 +1628,54 @@ static struct loop_device *loop_alloc(int i)
 	disk->private_data	= lo;
 	disk->queue		= lo->lo_queue;
 	sprintf(disk->disk_name, "loop%d", i);
-	return lo;
+	add_disk(disk);
+	*l = lo;
+	return lo->lo_number;
 
 out_free_queue:
 	blk_cleanup_queue(lo->lo_queue);
 out_free_dev:
 	kfree(lo);
 out:
-	return NULL;
+	return err;
 }
 
-static void loop_free(struct loop_device *lo)
+static void loop_remove(struct loop_device *lo)
 {
+	del_gendisk(lo->lo_disk);
 	blk_cleanup_queue(lo->lo_queue);
 	put_disk(lo->lo_disk);
-	list_del(&lo->lo_list);
 	kfree(lo);
 }
 
-static struct loop_device *loop_init_one(int i)
+static int loop_lookup(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
+	int ret = -ENODEV;
 
-	list_for_each_entry(lo, &loop_devices, lo_list) {
-		if (lo->lo_number == i)
-			return lo;
-	}
-
-	lo = loop_alloc(i);
+	lo = idr_find(&loop_index_idr, i);
 	if (lo) {
-		add_disk(lo->lo_disk);
-		list_add_tail(&lo->lo_list, &loop_devices);
+		*l = lo;
+		ret = lo->lo_number;
 	}
-	return lo;
-}
-
-static void loop_del_one(struct loop_device *lo)
-{
-	del_gendisk(lo->lo_disk);
-	loop_free(lo);
+	return ret;
 }
 
 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 {
 	struct loop_device *lo;
 	struct kobject *kobj;
+	int err;
 
-	mutex_lock(&loop_devices_mutex);
-	lo = loop_init_one(MINOR(dev) >> part_shift);
-	kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
-	mutex_unlock(&loop_devices_mutex);
+	mutex_lock(&loop_index_mutex);
+	err = loop_lookup(&lo, MINOR(dev) >> part_shift);
+	if (err < 0)
+		err = loop_add(&lo, MINOR(dev) >> part_shift);
+	if (err < 0)
+		kobj = ERR_PTR(err);
+	else
+		kobj = get_disk(lo->lo_disk);
+	mutex_unlock(&loop_index_mutex);
 
 	*part = 0;
 	return kobj;
@@ -1670,7 +1685,7 @@ static int __init loop_init(void)
 {
 	int i, nr;
 	unsigned long range;
-	struct loop_device *lo, *next;
+	struct loop_device *lo;
 
 	/*
 	 * loop module now has a feature to instantiate underlying device
@@ -1719,43 +1734,36 @@ static int __init loop_init(void)
 	if (register_blkdev(LOOP_MAJOR, "loop"))
 		return -EIO;
 
-	for (i = 0; i < nr; i++) {
-		lo = loop_alloc(i);
-		if (!lo)
-			goto Enomem;
-		list_add_tail(&lo->lo_list, &loop_devices);
-	}
-
-	/* point of no return */
-
-	list_for_each_entry(lo, &loop_devices, lo_list)
-		add_disk(lo->lo_disk);
-
 	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
 				  THIS_MODULE, loop_probe, NULL, NULL);
 
+	/* pre-create number devices of devices given by config or max_loop */
+	mutex_lock(&loop_index_mutex);
+	for (i = 0; i < nr; i++)
+		loop_add(&lo, i);
+	mutex_unlock(&loop_index_mutex);
+
 	printk(KERN_INFO "loop: module loaded\n");
 	return 0;
+}
 
-Enomem:
-	printk(KERN_INFO "loop: out of memory\n");
+static int loop_exit_cb(int id, void *ptr, void *data)
+{
+	struct loop_device *lo = ptr;
 
-	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-		loop_free(lo);
-
-	unregister_blkdev(LOOP_MAJOR, "loop");
-	return -ENOMEM;
+	loop_remove(lo);
+	return 0;
 }
 
 static void __exit loop_exit(void)
 {
 	unsigned long range;
-	struct loop_device *lo, *next;
 
 	range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
 
-	list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-		loop_del_one(lo);
+	idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
+	idr_remove_all(&loop_index_idr);
+	idr_destroy(&loop_index_idr);
 
 	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 66c194e2d9b9..5f08d18fa148 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -64,7 +64,6 @@ struct loop_device {
 
 	struct request_queue	*lo_queue;
 	struct gendisk		*lo_disk;
-	struct list_head	lo_list;
 };
 
 #endif /* __KERNEL__ */

From 770fe30a46a12b6fb6b63fbe1737654d28e84844 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 31 Jul 2011 22:08:04 +0200
Subject: [PATCH 05/23] loop: add management interface for on-demand device
 allocation

Loop devices today have a fixed pre-allocated number of usually 8.
The number can only be changed at module init time. To find a free
device to use, /dev/loop%i needs to be scanned, and all devices need
to be opened until a free one is possibly found.

This adds a new /dev/loop-control device node, that allows to
dynamically find or allocate a free device, and to add and remove loop
devices from the running system:
 LOOP_CTL_ADD adds a specific device. Arg is the number
 of the device. It returns the device i or a negative
 error code.

 LOOP_CTL_REMOVE removes a specific device, Arg is the
 number the device. It returns the device i or a negative
 error code.

 LOOP_CTL_GET_FREE finds the next unbound device or allocates
 a new one. No arg is given. It returns the device i or a
 negative error code.

The loop kernel module gets automatically loaded when
/dev/loop-control is accessed the first time. The alias
specified in the module, instructs udev to create this
'dead' device node, even when the module is not loaded.

Example:
 cfd = open("/dev/loop-control", O_RDWR);

 # add a new specific loop device
 err = ioctl(cfd, LOOP_CTL_ADD, devnr);

 # remove a specific loop device
 err = ioctl(cfd, LOOP_CTL_REMOVE, devnr);

 # find or allocate a free loop device to use
 devnr = ioctl(cfd, LOOP_CTL_GET_FREE);

 sprintf(loopname, "/dev/loop%i", devnr);
 ffd = open("backing-file", O_RDWR);
 lfd = open(loopname, O_RDWR);
 err = ioctl(lfd, LOOP_SET_FD, ffd);

Cc: Tejun Heo <tj@kernel.org>
Cc: Karel Zak  <kzak@redhat.com>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c       | 120 +++++++++++++++++++++++++++++++++++--
 include/linux/loop.h       |   4 ++
 include/linux/miscdevice.h |   1 +
 3 files changed, 121 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f58532e77777..5c9edf944879 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -75,7 +75,7 @@
 #include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
-
+#include <linux/miscdevice.h>
 #include <asm/uaccess.h>
 
 static DEFINE_IDR(loop_index_idr);
@@ -1478,13 +1478,22 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 
 static int lo_open(struct block_device *bdev, fmode_t mode)
 {
-	struct loop_device *lo = bdev->bd_disk->private_data;
+	struct loop_device *lo;
+	int err = 0;
+
+	mutex_lock(&loop_index_mutex);
+	lo = bdev->bd_disk->private_data;
+	if (!lo) {
+		err = -ENXIO;
+		goto out;
+	}
 
 	mutex_lock(&lo->lo_ctl_mutex);
 	lo->lo_refcnt++;
 	mutex_unlock(&lo->lo_ctl_mutex);
-
-	return 0;
+out:
+	mutex_unlock(&loop_index_mutex);
+	return err;
 }
 
 static int lo_release(struct gendisk *disk, fmode_t mode)
@@ -1603,6 +1612,13 @@ static int loop_add(struct loop_device **l, int i)
 			idr_remove(&loop_index_idr, m);
 			err = -EEXIST;
 		}
+	} else if (i == -1) {
+		int m;
+
+		/* get next free nr */
+		err = idr_get_new(&loop_index_idr, lo, &m);
+		if (err >= 0)
+			i = m;
 	} else {
 		err = -EINVAL;
 	}
@@ -1648,16 +1664,41 @@ static void loop_remove(struct loop_device *lo)
 	kfree(lo);
 }
 
+static int find_free_cb(int id, void *ptr, void *data)
+{
+	struct loop_device *lo = ptr;
+	struct loop_device **l = data;
+
+	if (lo->lo_state == Lo_unbound) {
+		*l = lo;
+		return 1;
+	}
+	return 0;
+}
+
 static int loop_lookup(struct loop_device **l, int i)
 {
 	struct loop_device *lo;
 	int ret = -ENODEV;
 
+	if (i < 0) {
+		int err;
+
+		err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
+		if (err == 1) {
+			*l = lo;
+			ret = lo->lo_number;
+		}
+		goto out;
+	}
+
+	/* lookup and return a specific i */
 	lo = idr_find(&loop_index_idr, i);
 	if (lo) {
 		*l = lo;
 		ret = lo->lo_number;
 	}
+out:
 	return ret;
 }
 
@@ -1681,11 +1722,76 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 	return kobj;
 }
 
+static long loop_control_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long parm)
+{
+	struct loop_device *lo;
+	int ret = -ENOSYS;
+
+	mutex_lock(&loop_index_mutex);
+	switch (cmd) {
+	case LOOP_CTL_ADD:
+		ret = loop_lookup(&lo, parm);
+		if (ret >= 0) {
+			ret = -EEXIST;
+			break;
+		}
+		ret = loop_add(&lo, parm);
+		break;
+	case LOOP_CTL_REMOVE:
+		ret = loop_lookup(&lo, parm);
+		if (ret < 0)
+			break;
+		mutex_lock(&lo->lo_ctl_mutex);
+		if (lo->lo_state != Lo_unbound) {
+			ret = -EBUSY;
+			mutex_unlock(&lo->lo_ctl_mutex);
+			break;
+		}
+		if (lo->lo_refcnt > 0) {
+			ret = -EBUSY;
+			mutex_unlock(&lo->lo_ctl_mutex);
+			break;
+		}
+		lo->lo_disk->private_data = NULL;
+		mutex_unlock(&lo->lo_ctl_mutex);
+		idr_remove(&loop_index_idr, lo->lo_number);
+		loop_remove(lo);
+		break;
+	case LOOP_CTL_GET_FREE:
+		ret = loop_lookup(&lo, -1);
+		if (ret >= 0)
+			break;
+		ret = loop_add(&lo, -1);
+	}
+	mutex_unlock(&loop_index_mutex);
+
+	return ret;
+}
+
+static const struct file_operations loop_ctl_fops = {
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= loop_control_ioctl,
+	.compat_ioctl	= loop_control_ioctl,
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice loop_misc = {
+	.minor		= LOOP_CTRL_MINOR,
+	.name		= "loop-control",
+	.fops		= &loop_ctl_fops,
+};
+
+MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+MODULE_ALIAS("devname:loop-control");
+
 static int __init loop_init(void)
 {
 	int i, nr;
 	unsigned long range;
 	struct loop_device *lo;
+	int err;
 
 	/*
 	 * loop module now has a feature to instantiate underlying device
@@ -1702,6 +1808,10 @@ static int __init loop_init(void)
 	 *     device on-demand.
 	 */
 
+	err = misc_register(&loop_misc);
+	if (err < 0)
+		return err;
+
 	part_shift = 0;
 	if (max_part > 0) {
 		part_shift = fls(max_part);
@@ -1767,6 +1877,8 @@ static void __exit loop_exit(void)
 
 	blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
 	unregister_blkdev(LOOP_MAJOR, "loop");
+
+	misc_deregister(&loop_misc);
 }
 
 module_init(loop_init);
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 5f08d18fa148..683d69890119 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -160,4 +160,8 @@ int loop_unregister_transfer(int number);
 #define LOOP_CHANGE_FD		0x4C06
 #define LOOP_SET_CAPACITY	0x4C07
 
+/* /dev/loop-control interface */
+#define LOOP_CTL_ADD		0x4C80
+#define LOOP_CTL_REMOVE		0x4C81
+#define LOOP_CTL_GET_FREE	0x4C82
 #endif
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index 18fd13028ba1..c309b1ecdc1c 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -40,6 +40,7 @@
 #define BTRFS_MINOR		234
 #define AUTOFS_MINOR		235
 #define MAPPER_CTRL_MINOR	236
+#define LOOP_CTRL_MINOR		237
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;

From d134b00b9acca3fb054d7c88a5f5d562ecbb42d1 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 31 Jul 2011 22:08:04 +0200
Subject: [PATCH 06/23] loop: add BLK_DEV_LOOP_MIN_COUNT=%i to allow distros 0
 pre-allocated loop devices

Instead of unconditionally creating a fixed number of dead loop
devices which need to be investigated by storage handling services,
even when they are never used, we allow distros start with 0
loop devices and have losetup(8) and similar switch to the dynamic
/dev/loop-control interface instead of searching /dev/loop%i for free
devices.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/kernel-parameters.txt |  9 ++++++---
 drivers/block/Kconfig               | 15 +++++++++++++++
 drivers/block/loop.c                | 27 ++++++++++-----------------
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4ca93898fbd3..c32851131646 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1340,9 +1340,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			it is equivalent to "nosmp", which also disables
 			the IO APIC.
 
-	max_loop=	[LOOP] Maximum number of loopback devices that can
-			be mounted
-			Format: <1-256>
+	max_loop=	[LOOP] The number of loop block devices that get
+	(loop.max_loop)	unconditionally pre-created at init time. The default
+			number is configured by BLK_DEV_LOOP_MIN_COUNT. Instead
+			of statically allocating a predefined number, loop
+			devices can be requested on-demand with the
+			/dev/loop-control interface.
 
 	mcatest=	[IA-64]
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 717d6e4e18d3..57212c5235e2 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -256,6 +256,21 @@ config BLK_DEV_LOOP
 
 	  Most users will answer N here.
 
+config BLK_DEV_LOOP_MIN_COUNT
+	int "Number of loop devices to pre-create at init time"
+	depends on BLK_DEV_LOOP
+	default 8
+	help
+	  Static number of loop devices to be unconditionally pre-created
+	  at init time.
+
+	  This default value can be overwritten on the kernel command
+	  line or with module-parameter loop.max_loop.
+
+	  The historic default is 8. If a late 2011 version of losetup(8)
+	  is used, it can be set to 0, since needed loop devices can be
+	  dynamically allocated with the /dev/loop-control interface.
+
 config BLK_DEV_CRYPTOLOOP
 	tristate "Cryptoloop Support"
 	select CRYPTO
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5c9edf944879..3defc52f060c 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1793,21 +1793,6 @@ static int __init loop_init(void)
 	struct loop_device *lo;
 	int err;
 
-	/*
-	 * loop module now has a feature to instantiate underlying device
-	 * structure on-demand, provided that there is an access dev node.
-	 * However, this will not work well with user space tool that doesn't
-	 * know about such "feature".  In order to not break any existing
-	 * tool, we do the following:
-	 *
-	 * (1) if max_loop is specified, create that many upfront, and this
-	 *     also becomes a hard limit.
-	 * (2) if max_loop is not specified, create 8 loop device on module
-	 *     load, user can further extend loop device by create dev node
-	 *     themselves and have kernel automatically instantiate actual
-	 *     device on-demand.
-	 */
-
 	err = misc_register(&loop_misc);
 	if (err < 0)
 		return err;
@@ -1833,11 +1818,19 @@ static int __init loop_init(void)
 	if (max_loop > 1UL << (MINORBITS - part_shift))
 		return -EINVAL;
 
+	/*
+	 * If max_loop is specified, create that many devices upfront.
+	 * This also becomes a hard limit. If max_loop is not specified,
+	 * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+	 * init time. Loop devices can be requested on-demand with the
+	 * /dev/loop-control interface, or be instantiated by accessing
+	 * a 'dead' device node.
+	 */
 	if (max_loop) {
 		nr = max_loop;
 		range = max_loop << part_shift;
 	} else {
-		nr = 8;
+		nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
 		range = 1UL << MINORBITS;
 	}
 
@@ -1847,7 +1840,7 @@ static int __init loop_init(void)
 	blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
 				  THIS_MODULE, loop_probe, NULL, NULL);
 
-	/* pre-create number devices of devices given by config or max_loop */
+	/* pre-create number of devices given by config or max_loop */
 	mutex_lock(&loop_index_mutex);
 	for (i = 0; i < nr; i++)
 		loop_add(&lo, i);

From 05eb0f252b04aa94ace0794f73d56c6a02351d80 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Sun, 31 Jul 2011 22:21:35 +0200
Subject: [PATCH 07/23] loop: fix deadlock when sysfs and LOOP_CLR_FD race
 against each other

LOOP_CLR_FD takes lo->lo_ctl_mutex and tries to remove the loop sysfs
files. Sysfs calls show() and waits for lo->lo_ctl_mutex. LOOP_CLR_FD
waits for show() to finish to remove the sysfs file.

  cat /sys/class/block/loop0/loop/backing_file
    mutex_lock_nested+0x176/0x350
    ? loop_attr_do_show_backing_file+0x2f/0xd0 [loop]
    ? loop_attr_do_show_backing_file+0x2f/0xd0 [loop]
    loop_attr_do_show_backing_file+0x2f/0xd0 [loop]
    dev_attr_show+0x1b/0x60
    ? sysfs_read_file+0x86/0x1a0
    ? __get_free_pages+0x12/0x50
    sysfs_read_file+0xaf/0x1a0

  ioctl(LOOP_CLR_FD):
    wait_for_common+0x12c/0x180
    ? try_to_wake_up+0x2a0/0x2a0
    wait_for_completion+0x18/0x20
    sysfs_deactivate+0x178/0x180
    ? sysfs_addrm_finish+0x43/0x70
    ? sysfs_addrm_start+0x1d/0x20
    sysfs_addrm_finish+0x43/0x70
    sysfs_hash_and_remove+0x85/0xa0
    sysfs_remove_group+0x59/0x100
    loop_clr_fd+0x1dc/0x3f0 [loop]
    lo_ioctl+0x223/0x7a0 [loop]

Instead of taking the lo_ctl_mutex from sysfs code, take the inner
lo->lo_lock, to protect the access to the backing_file data.

Thanks to Tejun for help debugging and finding a solution.

Cc: Milan Broz <mbroz@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/loop.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3defc52f060c..4720c7ade0ae 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -743,10 +743,10 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
 	ssize_t ret;
 	char *p = NULL;
 
-	mutex_lock(&lo->lo_ctl_mutex);
+	spin_lock_irq(&lo->lo_lock);
 	if (lo->lo_backing_file)
 		p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
-	mutex_unlock(&lo->lo_ctl_mutex);
+	spin_unlock_irq(&lo->lo_lock);
 
 	if (IS_ERR_OR_NULL(p))
 		ret = PTR_ERR(p);
@@ -1000,7 +1000,9 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
 	kthread_stop(lo->lo_thread);
 
+	spin_lock_irq(&lo->lo_lock);
 	lo->lo_backing_file = NULL;
+	spin_unlock_irq(&lo->lo_lock);
 
 	loop_release_xfer(lo);
 	lo->transfer = NULL;

From e5a94f56845bb4b272d82e84b5a1e2080b07ba82 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Mon, 1 Aug 2011 10:31:06 +0200
Subject: [PATCH 08/23] blk-throttle: correctly determine sync bio

read request is always sync. Using rw_is_sync() to determine
if a bio is sync.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-throttle.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index f6a794120505..a19f58c6fc3a 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -746,7 +746,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool rw = bio_data_dir(bio);
-	bool sync = bio->bi_rw & REQ_SYNC;
+	bool sync = rw_is_sync(bio->bi_rw);
 
 	/* Charge the bio to the group */
 	tg->bytes_disp[rw] += bio->bi_size;
@@ -1150,7 +1150,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 
 		if (tg_no_rule_group(tg, rw)) {
 			blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
-					rw, bio->bi_rw & REQ_SYNC);
+					rw, rw_is_sync(bio->bi_rw));
 			rcu_read_unlock();
 			return 0;
 		}

From a5395b83b78f62ccf5e3af854aacd025c2a6e7b5 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Tue, 2 Aug 2011 09:24:09 +0200
Subject: [PATCH 09/23] cfq-iosched: Reduce linked group count upon group
 destruction

FQ keeps track of number of groups which are linked on blkcg->blkg_list.
This is useful to avoid races between queue exit and cgroup exit code
paths. So if at the request queue exit time linked group count is not
zero, that means there are some group out there which is yet to be
deleted under rcu read period and queue exit code should wait for
on rcu period.

In my previous patch I forgot to decrease the number of group count.
So in current form, we nr_blkcg_linked_grps is always non-zero and
we will always wait one rcu period (if BLK_CGROUP=y). The side effect
of this is that it can increase boot time. I am surprised, nobody
complained so far.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 1f96ad6254f1..650834537606 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1209,6 +1209,9 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
 	hlist_del_init(&cfqg->cfqd_node);
 
+	BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
+	cfqd->nr_blkcg_linked_grps--;
+
 	/*
 	 * Put the reference taken at the time of creation so that when all
 	 * queues are gone, group can be destroyed.

From e2a5429ff7947ad251310376384f449297b7492a Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Tue, 2 Aug 2011 10:43:35 +0200
Subject: [PATCH 10/23] bsg-lib: add module.h include

Due to conflicts with the moduleh tree in linux-next, we
run into an include file mess. We really need export.h
in that tree, but if we add module.h locally then the
issue is easier to resolve.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/bsg-lib.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index f8c0a61a529c..6690e6e41037 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -25,6 +25,7 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/bsg-lib.h>
+#include <linux/module.h>
 #include <scsi/scsi_cmnd.h>
 
 /**

From ddad9ef5826efdfbbdb67b13b46f30e43e46ec3e Mon Sep 17 00:00:00 2001
From: H Hartley Sweeten <hartleys@visionengravers.com>
Date: Tue, 2 Aug 2011 12:43:49 +0200
Subject: [PATCH 11/23] drivers/block/drbd/drbd_nl.c: use bitmap_parse instead
 of __bitmap_parse

The buffer 'sc.cpu_mask' is a kernel buffer.  If bitmap_parse is used
instead of __bitmap_parse the extra parameter that indicates a kernel
buffer is not needed.

Signed-off-by: H Hartley Sweeten <hsweeten@visionengravers.com>
Cc: Lars Ellenberg <drbd-dev@lists.linbit.com>
Cc: Philipp Reisner <philipp.reisner@linbit.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/drbd/drbd_nl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 515bcd948a43..0feab261e295 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1829,10 +1829,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 
 	/* silently ignore cpu mask on UP kernel */
 	if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
-		err = __bitmap_parse(sc.cpu_mask, 32, 0,
+		err = bitmap_parse(sc.cpu_mask, 32,
 				cpumask_bits(new_cpu_mask), nr_cpu_ids);
 		if (err) {
-			dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
+			dev_warn(DEV, "bitmap_parse() failed with %d\n", err);
 			retcode = ERR_CPU_MASK_PARSE;
 			goto fail;
 		}

From aec9f377e4f235c47e27fd8a429555dfa2dda342 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 2 Aug 2011 12:43:50 +0200
Subject: [PATCH 12/23] drivers/cdrom/cdrom.c: relax check on dvd manufacturer
 value

The report has an ISO which has a very long manufacturer ID.  It seems
that Linux is wrong, not the ISO maker.

Relax the check for the length of this field: emit a warning and truncate
the incoming data to 2048 bytes rather than rejecting the entire thing.

dvd_manufact.value isn't null-terminated.  I'm not even sure if it's a
string.  The kernel doesn't apepar to use it anyway.

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=39062

Reported-by: <ale.goujon@gmail.com>
Tested-by: <ale.goujon@gmail.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/cdrom/cdrom.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index 75fb965b8f72..f997c27d79e2 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -1929,11 +1929,17 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s,
 		goto out;
 
 	s->manufact.len = buf[0] << 8 | buf[1];
-	if (s->manufact.len < 0 || s->manufact.len > 2048) {
+	if (s->manufact.len < 0) {
 		cdinfo(CD_WARNING, "Received invalid manufacture info length"
 				   " (%d)\n", s->manufact.len);
 		ret = -EIO;
 	} else {
+		if (s->manufact.len > 2048) {
+			cdinfo(CD_WARNING, "Received invalid manufacture info "
+					"length (%d): truncating to 2048\n",
+					s->manufact.len);
+			s->manufact.len = 2048;
+		}
 		memcpy(s->manufact.value, &buf[4], s->manufact.len);
 	}
 

From f95fe9cfb49f6e625fbb5888cae2ed6f3a276b89 Mon Sep 17 00:00:00 2001
From: Herbert Poetzl <herbert@13thfloor.at>
Date: Tue, 2 Aug 2011 12:43:50 +0200
Subject: [PATCH 13/23] block/genhd.c: remove useless cast in diskstats_show()

Remove the (unsigned long long) cast in diskstats_show() and adjusts the
seq_printf() format string to 'unsigned long'

diskstats_show() uses part_stat_read() to get the stats, which either
accesses the specified field in the struct disk_stats directly (non SMP)
or sums up the per CPU values in a variable of the same type as the field,
so in any case the result will have the same type and range as the
specified field which for all disk_stats entries is unsigned long

Also, for unsigned long ranges the output of %lu should be identical to
the one of %llu, so no change in the actual proc entry contents.

Signed-off-by: Herbert Poetzl <herbert@13thfloor.at>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/genhd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/genhd.c b/block/genhd.c
index 5cb51c55f6d8..e2f67902dd02 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1146,17 +1146,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 		cpu = part_stat_lock();
 		part_round_stats(cpu, hd);
 		part_stat_unlock();
-		seq_printf(seqf, "%4d %7d %s %lu %lu %llu "
-			   "%u %lu %lu %llu %u %u %u %u\n",
+		seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
+			   "%u %lu %lu %lu %u %u %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
 			   part_stat_read(hd, ios[READ]),
 			   part_stat_read(hd, merges[READ]),
-			   (unsigned long long)part_stat_read(hd, sectors[READ]),
+			   part_stat_read(hd, sectors[READ]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
 			   part_stat_read(hd, ios[WRITE]),
 			   part_stat_read(hd, merges[WRITE]),
-			   (unsigned long long)part_stat_read(hd, sectors[WRITE]),
+			   part_stat_read(hd, sectors[WRITE]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
 			   part_in_flight(hd),
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),

From f41c53a569c4cf0556893ec9cfcf697d069799e1 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@gmail.com>
Date: Wed, 3 Aug 2011 15:02:55 +0200
Subject: [PATCH 14/23] block: swim3: fix unterminated of_device_id table

of_device_id structures need a NULL terminating entry, add it.

Signed-off-by: Axel Lin <axel.lin@gmail.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 drivers/block/swim3.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 773bfa792777..ae3e167e17ad 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1184,6 +1184,7 @@ static struct of_device_id swim3_match[] =
 	{
 	.compatible	= "swim3"
 	},
+	{ /* end of list */ }
 };
 
 static struct macio_driver swim3_driver =

From 35ae66e0a09ab70ed588e65f26b4c725cd1656b6 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Fri, 5 Aug 2011 09:37:10 +0200
Subject: [PATCH 15/23] block: Make rq_affinity = 1 work as expected

Commit 5757a6d76c introduced a new rq_affinity = 2 so as to make
the request completed in the __make_request cpu. But it makes the
old rq_affinity = 1 not work any more. The root cause is that
if the 'cpu' and 'req->cpu' is in the same group and cpu != req->cpu,
ccpu will be the same as group_cpu, so the completion will be
excuted in the 'cpu' not 'group_cpu'.

This patch fix problem by simpling removing group_cpu and the codes
are more explicit now. If ccpu == cpu, we complete in cpu, otherwise
we raise_blk_irq to ccpu.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Roland Dreier <roland@purestorage.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Jens Axboe <jaxboe@fusionio.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
Reviewed-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-softirq.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 475fab809a80..487addc85bb5 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,7 +103,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
-	int ccpu, cpu, group_cpu = NR_CPUS;
+	int ccpu, cpu;
 	struct request_queue *q = req->q;
 	unsigned long flags;
 
@@ -117,14 +117,12 @@ void __blk_complete_request(struct request *req)
 	 */
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
 			ccpu = blk_cpu_to_group(ccpu);
-			group_cpu = blk_cpu_to_group(cpu);
-		}
 	} else
 		ccpu = cpu;
 
-	if (ccpu == cpu || ccpu == group_cpu) {
+	if (ccpu == cpu) {
 		struct list_head *list;
 do_local:
 		list = &__get_cpu_var(blk_cpu_done);

From 4931402a9dd00b2997e95bfbb89409b2a6dbb383 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@redhat.com>
Date: Fri, 5 Aug 2011 09:42:20 +0200
Subject: [PATCH 16/23] cfq-iosched: Add documentation about idling

There are always questions about why CFQ is idling on various conditions.
Recent ones is Christoph asking again why to idle on REQ_NOIDLE. His
assertion is that XFS is relying more and more on workqueues and is
concerned that CFQ idling on IO from every workqueue will impact
XFS badly.

So he suggested that I add some more documentation about CFQ idling
and that can provide more clarity on the topic and also gives an
opprotunity to poke a hole in theory and lead to improvements.

So here is my attempt at that. Any comments are welcome.

Signed-off-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 Documentation/block/cfq-iosched.txt | 71 +++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/Documentation/block/cfq-iosched.txt b/Documentation/block/cfq-iosched.txt
index e578feed6d81..6d670f570451 100644
--- a/Documentation/block/cfq-iosched.txt
+++ b/Documentation/block/cfq-iosched.txt
@@ -43,3 +43,74 @@ If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
 to IOPS mode and starts providing fairness in terms of number of requests
 dispatched. Note that this mode switching takes effect only for group
 scheduling. For non-cgroup users nothing should change.
+
+CFQ IO scheduler Idling Theory
+===============================
+Idling on a queue is primarily about waiting for the next request to come
+on same queue after completion of a request. In this process CFQ will not
+dispatch requests from other cfq queues even if requests are pending there.
+
+The rationale behind idling is that it can cut down on number of seeks
+on rotational media. For example, if a process is doing dependent
+sequential reads (next read will come on only after completion of previous
+one), then not dispatching request from other queue should help as we
+did not move the disk head and kept on dispatching sequential IO from
+one queue.
+
+CFQ has following service trees and various queues are put on these trees.
+
+	sync-idle	sync-noidle	async
+
+All cfq queues doing synchronous sequential IO go on to sync-idle tree.
+On this tree we idle on each queue individually.
+
+All synchronous non-sequential queues go on sync-noidle tree. Also any
+request which are marked with REQ_NOIDLE go on this service tree. On this
+tree we do not idle on individual queues instead idle on the whole group
+of queues or the tree. So if there are 4 queues waiting for IO to dispatch
+we will idle only once last queue has dispatched the IO and there is
+no more IO on this service tree.
+
+All async writes go on async service tree. There is no idling on async
+queues.
+
+CFQ has some optimizations for SSDs and if it detects a non-rotational
+media which can support higher queue depth (multiple requests at in
+flight at a time), then it cuts down on idling of individual queues and
+all the queues move to sync-noidle tree and only tree idle remains. This
+tree idling provides isolation with buffered write queues on async tree.
+
+FAQ
+===
+Q1. Why to idle at all on queues marked with REQ_NOIDLE.
+
+A1. We only do tree idle (all queues on sync-noidle tree) on queues marked
+    with REQ_NOIDLE. This helps in providing isolation with all the sync-idle
+    queues. Otherwise in presence of many sequential readers, other
+    synchronous IO might not get fair share of disk.
+
+    For example, if there are 10 sequential readers doing IO and they get
+    100ms each. If a REQ_NOIDLE request comes in, it will be scheduled
+    roughly after 1 second. If after completion of REQ_NOIDLE request we
+    do not idle, and after a couple of milli seconds a another REQ_NOIDLE
+    request comes in, again it will be scheduled after 1second. Repeat it
+    and notice how a workload can lose its disk share and suffer due to
+    multiple sequential readers.
+
+    fsync can generate dependent IO where bunch of data is written in the
+    context of fsync, and later some journaling data is written. Journaling
+    data comes in only after fsync has finished its IO (atleast for ext4
+    that seemed to be the case). Now if one decides not to idle on fsync
+    thread due to REQ_NOIDLE, then next journaling write will not get
+    scheduled for another second. A process doing small fsync, will suffer
+    badly in presence of multiple sequential readers.
+
+    Hence doing tree idling on threads using REQ_NOIDLE flag on requests
+    provides isolation from multiple sequential readers and at the same
+    time we do not idle on individual threads.
+
+Q2. When to specify REQ_NOIDLE
+A2. I would think whenever one is doing synchronous write and not expecting
+    more writes to be dispatched from same context soon, should be able
+    to specify REQ_NOIDLE on writes and that probably should work well for
+    most of the cases.

From ea5e116162b7e0cf83a2b8a273440514404604de Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Wed, 3 Aug 2011 11:12:17 -0400
Subject: [PATCH 17/23] xen/blkback: Make description more obvious.

With the frontend having Xen but the backend not, it just looks odd:

  <*>   Xen virtual block device support
  <*>   Block-device backend driver

Fix it to have the 'Xen' in front of it.

Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 drivers/block/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 717d6e4e18d3..a89ebf1b28aa 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -471,7 +471,7 @@ config XEN_BLKDEV_FRONTEND
 	  in another domain which drives the actual block device.
 
 config XEN_BLKDEV_BACKEND
-	tristate "Block-device backend driver"
+	tristate "Xen block-device backend driver"
 	depends on XEN_BACKEND
 	help
 	  The block-device backend driver allows the kernel to export its

From fa1bf42ff9296ac4cf211b0a1b450a6071d26a95 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Tue, 9 Aug 2011 20:32:09 +0200
Subject: [PATCH 18/23] allow blk_flush_policy to return REQ_FSEQ_DATA
 independent of *FLUSH

blk_insert_flush has the following check:

	/*
	 * If there's data but flush is not necessary, the request can be
	 * processed directly without going through flush machinery.  Queue
	 * for normal execution.
	 */
	if ((policy & REQ_FSEQ_DATA) &&
	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
		list_add_tail(&rq->queuelist, &q->queue_head);
		return;
	}

However, blk_flush_policy will not return with policy set to only
REQ_FSEQ_DATA:

static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (fflags & REQ_FLUSH) {
		if (rq->cmd_flags & REQ_FLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		if (blk_rq_sectors(rq))
			policy |= REQ_FSEQ_DATA;
		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}

Notice that REQ_FSEQ_DATA is only set if REQ_FLUSH is set.  Fix this
mismatch by moving the setting of REQ_FSEQ_DATA outside of the REQ_FLUSH
check.

Tejun notes:

  Hmmm... yes, this can become a correctness issue if (and only if)
  blk_queue_flush() is called to change q->flush_flags while requests
  are in-flight; otherwise, requests wouldn't reach the function at all.
  Also, I think it would be a generally good idea to always set
  FSEQ_DATA if the request has data.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-flush.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index bb21e4c36f70..2d162bd840d3 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -95,11 +95,12 @@ static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
 {
 	unsigned int policy = 0;
 
+	if (blk_rq_sectors(rq))
+		policy |= REQ_FSEQ_DATA;
+
 	if (fflags & REQ_FLUSH) {
 		if (rq->cmd_flags & REQ_FLUSH)
 			policy |= REQ_FSEQ_PREFLUSH;
-		if (blk_rq_sectors(rq))
-			policy |= REQ_FSEQ_DATA;
 		if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
 			policy |= REQ_FSEQ_POSTFLUSH;
 	}

From 8e4bf84474960e832b56293c9b0674c88b5b05ce Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Thu, 11 Aug 2011 10:36:03 +0200
Subject: [PATCH 19/23] Move some REQ flags to the common bio/request area

REQ_SECURE, REQ_FLUSH and REQ_FUA may all be set on a bio as well as
on a request, so relocate them to the shared part of the enum.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blk_types.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 6395692b2e7a..32f0076e844b 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -125,7 +125,11 @@ enum rq_flag_bits {
 	__REQ_SYNC,		/* request is sync (sync write or read) */
 	__REQ_META,		/* metadata io request */
 	__REQ_DISCARD,		/* request to discard sectors */
+	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
+
 	__REQ_NOIDLE,		/* don't anticipate more IO after this one */
+	__REQ_FUA,		/* forced unit access */
+	__REQ_FLUSH,		/* request for cache flush */
 
 	/* bio only flags */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
@@ -135,7 +139,6 @@ enum rq_flag_bits {
 	/* request only flags */
 	__REQ_SORTED,		/* elevator knows about this request */
 	__REQ_SOFTBARRIER,	/* may not be passed by ioscheduler */
-	__REQ_FUA,		/* forced unit access */
 	__REQ_NOMERGE,		/* don't touch this for merging */
 	__REQ_STARTED,		/* drive already may have started this one */
 	__REQ_DONTPREP,		/* don't call prep for this one */
@@ -146,11 +149,9 @@ enum rq_flag_bits {
 	__REQ_PREEMPT,		/* set for "ide_preempt" requests */
 	__REQ_ALLOCED,		/* request came from our alloc pool */
 	__REQ_COPY_USER,	/* contains copies of user pages */
-	__REQ_FLUSH,		/* request for cache flush */
 	__REQ_FLUSH_SEQ,	/* request for flush sequence */
 	__REQ_IO_STAT,		/* account I/O stat */
 	__REQ_MIXED_MERGE,	/* merge of different types, fail separately */
-	__REQ_SECURE,		/* secure discard (used with __REQ_DISCARD) */
 	__REQ_NR_BITS,		/* stops here */
 };
 

From c09c47caedc9854d59378d6e34c989e51cfdd2b4 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Thu, 11 Aug 2011 10:36:05 +0200
Subject: [PATCH 20/23] blktrace: add FLUSH/FUA support

Add FLUSH/FUA support to blktrace. As FLUSH precedes WRITE and/or
FUA follows WRITE, use the same 'F' flag for both cases and
distinguish them by their (relative) position. The end results
look like (other flags might be shown also):

 - WRITE:            W
 - WRITE_FLUSH:      FW
 - WRITE_FUA:        WF
 - WRITE_FLUSH_FUA:  FWF

Note that we reuse TC_BARRIER due to lack of bit space of act_mask
so that the older versions of blktrace tools will report flush
requests as barriers from now on.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 include/linux/blktrace_api.h |  5 +++--
 include/trace/events/block.h | 20 +++++++++++---------
 kernel/trace/blktrace.c      | 21 ++++++++++++++++-----
 3 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 8c7c2de7631a..8e9e4bc6d73b 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -14,7 +14,7 @@
 enum blktrace_cat {
 	BLK_TC_READ	= 1 << 0,	/* reads */
 	BLK_TC_WRITE	= 1 << 1,	/* writes */
-	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
+	BLK_TC_FLUSH	= 1 << 2,	/* flush */
 	BLK_TC_SYNC	= 1 << 3,	/* sync IO */
 	BLK_TC_SYNCIO	= BLK_TC_SYNC,
 	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
@@ -28,8 +28,9 @@ enum blktrace_cat {
 	BLK_TC_META	= 1 << 12,	/* metadata */
 	BLK_TC_DISCARD	= 1 << 13,	/* discard requests */
 	BLK_TC_DRV_DATA	= 1 << 14,	/* binary per-driver data */
+	BLK_TC_FUA	= 1 << 15,	/* fua requests */
 
-	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
+	BLK_TC_END	= 1 << 15,	/* we've run out of bits! */
 };
 
 #define BLK_TC_SHIFT		(16)
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index bf366547da25..05c5e61f0a7c 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -8,6 +8,8 @@
 #include <linux/blkdev.h>
 #include <linux/tracepoint.h>
 
+#define RWBS_LEN	8
+
 DECLARE_EVENT_CLASS(block_rq_with_error,
 
 	TP_PROTO(struct request_queue *q, struct request *rq),
@@ -19,7 +21,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
 		__field(  sector_t,	sector			)
 		__field(  unsigned int,	nr_sector		)
 		__field(  int,		errors			)
-		__array(  char,		rwbs,	6		)
+		__array(  char,		rwbs,	RWBS_LEN	)
 		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
 	),
 
@@ -104,7 +106,7 @@ DECLARE_EVENT_CLASS(block_rq,
 		__field(  sector_t,	sector			)
 		__field(  unsigned int,	nr_sector		)
 		__field(  unsigned int,	bytes			)
-		__array(  char,		rwbs,	6		)
+		__array(  char,		rwbs,	RWBS_LEN	)
 		__array(  char,         comm,   TASK_COMM_LEN   )
 		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
 	),
@@ -183,7 +185,7 @@ TRACE_EVENT(block_bio_bounce,
 		__field( dev_t,		dev			)
 		__field( sector_t,	sector			)
 		__field( unsigned int,	nr_sector		)
-		__array( char,		rwbs,	6		)
+		__array( char,		rwbs,	RWBS_LEN	)
 		__array( char,		comm,	TASK_COMM_LEN	)
 	),
 
@@ -222,7 +224,7 @@ TRACE_EVENT(block_bio_complete,
 		__field( sector_t,	sector		)
 		__field( unsigned,	nr_sector	)
 		__field( int,		error		)
-		__array( char,		rwbs,	6	)
+		__array( char,		rwbs,	RWBS_LEN)
 	),
 
 	TP_fast_assign(
@@ -249,7 +251,7 @@ DECLARE_EVENT_CLASS(block_bio,
 		__field( dev_t,		dev			)
 		__field( sector_t,	sector			)
 		__field( unsigned int,	nr_sector		)
-		__array( char,		rwbs,	6		)
+		__array( char,		rwbs,	RWBS_LEN	)
 		__array( char,		comm,	TASK_COMM_LEN	)
 	),
 
@@ -321,7 +323,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
 		__field( dev_t,		dev			)
 		__field( sector_t,	sector			)
 		__field( unsigned int,	nr_sector		)
-		__array( char,		rwbs,	6		)
+		__array( char,		rwbs,	RWBS_LEN	)
 		__array( char,		comm,	TASK_COMM_LEN	)
         ),
 
@@ -456,7 +458,7 @@ TRACE_EVENT(block_split,
 		__field( dev_t,		dev				)
 		__field( sector_t,	sector				)
 		__field( sector_t,	new_sector			)
-		__array( char,		rwbs,		6		)
+		__array( char,		rwbs,		RWBS_LEN	)
 		__array( char,		comm,		TASK_COMM_LEN	)
 	),
 
@@ -498,7 +500,7 @@ TRACE_EVENT(block_bio_remap,
 		__field( unsigned int,	nr_sector	)
 		__field( dev_t,		old_dev		)
 		__field( sector_t,	old_sector	)
-		__array( char,		rwbs,	6	)
+		__array( char,		rwbs,	RWBS_LEN)
 	),
 
 	TP_fast_assign(
@@ -542,7 +544,7 @@ TRACE_EVENT(block_rq_remap,
 		__field( unsigned int,	nr_sector	)
 		__field( dev_t,		old_dev		)
 		__field( sector_t,	old_sector	)
-		__array( char,		rwbs,	6	)
+		__array( char,		rwbs,	RWBS_LEN)
 	),
 
 	TP_fast_assign(
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6957aa298dfa..7c910a5593a6 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(rw, RAHEAD);
 	what |= MASK_TC_BIT(rw, META);
 	what |= MASK_TC_BIT(rw, DISCARD);
+	what |= MASK_TC_BIT(rw, FLUSH);
+	what |= MASK_TC_BIT(rw, FUA);
 
 	pid = tsk->pid;
 	if (act_log_check(bt, what, sector, pid))
@@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 		goto out;
 	}
 
+	if (tc & BLK_TC_FLUSH)
+		rwbs[i++] = 'F';
+
 	if (tc & BLK_TC_DISCARD)
 		rwbs[i++] = 'D';
 	else if (tc & BLK_TC_WRITE)
@@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 	else
 		rwbs[i++] = 'N';
 
+	if (tc & BLK_TC_FUA)
+		rwbs[i++] = 'F';
 	if (tc & BLK_TC_AHEAD)
 		rwbs[i++] = 'A';
-	if (tc & BLK_TC_BARRIER)
-		rwbs[i++] = 'B';
 	if (tc & BLK_TC_SYNC)
 		rwbs[i++] = 'S';
 	if (tc & BLK_TC_META)
@@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
 
 static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 {
-	char rwbs[6];
+	char rwbs[RWBS_LEN];
 	unsigned long long ts  = iter->ts;
 	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
 	unsigned secs	       = (unsigned long)ts;
@@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 
 static int blk_log_action(struct trace_iterator *iter, const char *act)
 {
-	char rwbs[6];
+	char rwbs[RWBS_LEN];
 	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
 
 	fill_rwbs(rwbs, t);
@@ -1561,7 +1566,7 @@ static const struct {
 } mask_maps[] = {
 	{ BLK_TC_READ,		"read"		},
 	{ BLK_TC_WRITE,		"write"		},
-	{ BLK_TC_BARRIER,	"barrier"	},
+	{ BLK_TC_FLUSH,		"flush"		},
 	{ BLK_TC_SYNC,		"sync"		},
 	{ BLK_TC_QUEUE,		"queue"		},
 	{ BLK_TC_REQUEUE,	"requeue"	},
@@ -1573,6 +1578,7 @@ static const struct {
 	{ BLK_TC_META,		"meta"		},
 	{ BLK_TC_DISCARD,	"discard"	},
 	{ BLK_TC_DRV_DATA,	"drv_data"	},
+	{ BLK_TC_FUA,		"fua"		},
 };
 
 static int blk_trace_str2mask(const char *str)
@@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 {
 	int i = 0;
 
+	if (rw & REQ_FLUSH)
+		rwbs[i++] = 'F';
+
 	if (rw & WRITE)
 		rwbs[i++] = 'W';
 	else if (rw & REQ_DISCARD)
@@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 	else
 		rwbs[i++] = 'N';
 
+	if (rw & REQ_FUA)
+		rwbs[i++] = 'F';
 	if (rw & REQ_RAHEAD)
 		rwbs[i++] = 'A';
 	if (rw & REQ_SYNC)

From bcf30e75b773b60379338768677a1301ef602ff9 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Thu, 11 Aug 2011 10:39:04 +0200
Subject: [PATCH 21/23] block: improve rq_affinity placement

This patch reverts commit 35ae66e0a09ab70ed(block: Make rq_affinity = 1
work as expected). The purpose is to avoid an unnecessary IPI.
Let's take an example. My test box has cpu 0-7, one socket. Say request is
added from CPU 1, blk_complete_request() occurs at CPU 7. Without the reverted
patch, softirq will be done at CPU 7. With it, an IPI will be directed to CPU
0, and softirq will be done at CPU 0. In this case, doing softirq at CPU 0 and
CPU 7 have no difference from cache sharing point view and we can avoid an
ipi if doing it in CPU 7.
An immediate concern is this is just like QUEUE_FLAG_SAME_FORCE, but actually
not. blk_complete_request() is running in interrupt handler, and currently
I/O controller doesn't support multiple interrupts (I checked several LSI
cards and AHCI), so only one CPU can run blk_complete_request(). This is
still quite different as QUEUE_FLAG_SAME_FORCE.
Since only one CPU runs softirq, the only difference with below patch is
softirq not always runs at the first CPU of a group.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-softirq.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 487addc85bb5..58340d0cb23a 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -103,7 +103,7 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = {
 
 void __blk_complete_request(struct request *req)
 {
-	int ccpu, cpu;
+	int ccpu, cpu, group_cpu = NR_CPUS;
 	struct request_queue *q = req->q;
 	unsigned long flags;
 
@@ -117,12 +117,22 @@ void __blk_complete_request(struct request *req)
 	 */
 	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
 		ccpu = req->cpu;
-		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
+		if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
 			ccpu = blk_cpu_to_group(ccpu);
+			group_cpu = blk_cpu_to_group(cpu);
+		}
 	} else
 		ccpu = cpu;
 
-	if (ccpu == cpu) {
+	/*
+	 * If current CPU and requested CPU are in the same group, running
+	 * softirq in current CPU. One might concern this is just like
+	 * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
+	 * running in interrupt handler, and currently I/O controller doesn't
+	 * support multiple interrupts, so current CPU is unique actually. This
+	 * avoids IPI sending from current CPU to the first CPU of a group.
+	 */
+	if (ccpu == cpu || ccpu == group_cpu) {
 		struct list_head *list;
 do_local:
 		list = &__get_cpu_var(blk_cpu_done);

From 4853abaae7e4a2af938115ce9071ef8684fb7af4 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Mon, 15 Aug 2011 21:37:25 +0200
Subject: [PATCH 22/23] block: fix flush machinery for stacking drivers with
 differring flush flags

Commit ae1b1539622fb46e51b4d13b3f9e5f4c713f86ae, block: reimplement
FLUSH/FUA to support merge, introduced a performance regression when
running any sort of fsyncing workload using dm-multipath and certain
storage (in our case, an HP EVA).  The test I ran was fs_mark, and it
dropped from ~800 files/sec on ext4 to ~100 files/sec.  It turns out
that dm-multipath always advertised flush+fua support, and passed
commands on down the stack, where those flags used to get stripped off.
The above commit changed that behavior:

static inline struct request *__elv_next_request(struct request_queue *q)
{
        struct request *rq;

        while (1) {
-               while (!list_empty(&q->queue_head)) {
+               if (!list_empty(&q->queue_head)) {
                        rq = list_entry_rq(q->queue_head.next);
-                       if (!(rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) ||
-                           (rq->cmd_flags & REQ_FLUSH_SEQ))
-                               return rq;
-                       rq = blk_do_flush(q, rq);
-                       if (rq)
-                               return rq;
+                       return rq;
                }

Note that previously, a command would come in here, have
REQ_FLUSH|REQ_FUA set, and then get handed off to blk_do_flush:

struct request *blk_do_flush(struct request_queue *q, struct request *rq)
{
        unsigned int fflags = q->flush_flags; /* may change, cache it */
        bool has_flush = fflags & REQ_FLUSH, has_fua = fflags & REQ_FUA;
        bool do_preflush = has_flush && (rq->cmd_flags & REQ_FLUSH);
        bool do_postflush = has_flush && !has_fua && (rq->cmd_flags &
        REQ_FUA);
        unsigned skip = 0;
...
        if (blk_rq_sectors(rq) && !do_preflush && !do_postflush) {
                rq->cmd_flags &= ~REQ_FLUSH;
		if (!has_fua)
			rq->cmd_flags &= ~REQ_FUA;
	        return rq;
	}

So, the flush machinery was bypassed in such cases (q->flush_flags == 0
&& rq->cmd_flags & (REQ_FLUSH|REQ_FUA)).

Now, however, we don't get into the flush machinery at all.  Instead,
__elv_next_request just hands a request with flush and fua bits set to
the scsi_request_fn, even if the underlying request_queue does not
support flush or fua.

The agreed upon approach is to fix the flush machinery to allow
stacking.  While this isn't used in practice (since there is only one
request-based dm target, and that target will now reflect the flush
flags of the underlying device), it does future-proof the solution, and
make it function as designed.

In order to make this work, I had to add a field to the struct request,
inside the flush structure (to store the original req->end_io).  Shaohua
had suggested overloading the union with rb_node and completion_data,
but the completion data is used by device mapper and can also be used by
other drivers.  So, I didn't see a way around the additional field.

I tested this patch on an HP EVA with both ext4 and xfs, and it recovers
the lost performance.  Comments and other testers, as always, are
appreciated.

Cheers,
Jeff

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/blk-core.c       |  8 ++++++--
 block/blk-flush.c      | 20 ++++++++++++++++----
 block/blk.h            |  2 ++
 include/linux/blkdev.h |  1 +
 4 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index b850bedad229..7c59b0f5eae8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1700,6 +1700,7 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
 	unsigned long flags;
+	int where = ELEVATOR_INSERT_BACK;
 
 	if (blk_rq_check_limits(q, rq))
 		return -EIO;
@@ -1716,7 +1717,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	 */
 	BUG_ON(blk_queued_rq(rq));
 
-	add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+	if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+		where = ELEVATOR_INSERT_FLUSH;
+
+	add_acct_request(q, rq, where);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
 	return 0;
@@ -2273,7 +2277,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-static bool __blk_end_bidi_request(struct request *rq, int error,
+bool __blk_end_bidi_request(struct request *rq, int error,
 				   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 2d162bd840d3..491eb30a242d 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -123,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq)
 
 	/* make @rq a normal request */
 	rq->cmd_flags &= ~REQ_FLUSH_SEQ;
-	rq->end_io = NULL;
+	rq->end_io = rq->flush.saved_end_io;
 }
 
 /**
@@ -301,9 +301,6 @@ void blk_insert_flush(struct request *rq)
 	unsigned int fflags = q->flush_flags;	/* may change, cache */
 	unsigned int policy = blk_flush_policy(fflags, rq);
 
-	BUG_ON(rq->end_io);
-	BUG_ON(!rq->bio || rq->bio != rq->biotail);
-
 	/*
 	 * @policy now records what operations need to be done.  Adjust
 	 * REQ_FLUSH and FUA for the driver.
@@ -312,6 +309,19 @@ void blk_insert_flush(struct request *rq)
 	if (!(fflags & REQ_FUA))
 		rq->cmd_flags &= ~REQ_FUA;
 
+	/*
+	 * An empty flush handed down from a stacking driver may
+	 * translate into nothing if the underlying device does not
+	 * advertise a write-back cache.  In this case, simply
+	 * complete the request.
+	 */
+	if (!policy) {
+		__blk_end_bidi_request(rq, 0, 0, 0);
+		return;
+	}
+
+	BUG_ON(!rq->bio || rq->bio != rq->biotail);
+
 	/*
 	 * If there's data but flush is not necessary, the request can be
 	 * processed directly without going through flush machinery.  Queue
@@ -320,6 +330,7 @@ void blk_insert_flush(struct request *rq)
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
 		list_add_tail(&rq->queuelist, &q->queue_head);
+		blk_run_queue_async(q);
 		return;
 	}
 
@@ -330,6 +341,7 @@ void blk_insert_flush(struct request *rq)
 	memset(&rq->flush, 0, sizeof(rq->flush));
 	INIT_LIST_HEAD(&rq->flush.list);
 	rq->cmd_flags |= REQ_FLUSH_SEQ;
+	rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
 	rq->end_io = flush_data_end_io;
 
 	blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
diff --git a/block/blk.h b/block/blk.h
index d6586287adc9..20b900a377c9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -17,6 +17,8 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		      struct bio *bio);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
+bool __blk_end_bidi_request(struct request *rq, int error,
+			    unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_delete_timer(struct request *);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 847928546076..84b15d54f8c2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -118,6 +118,7 @@ struct request {
 		struct {
 			unsigned int		seq;
 			struct list_head	list;
+			rq_end_io_fn		*saved_end_io;
 		} flush;
 	};
 

From b53d1ed734a2b9af8da115b836b658daa7d47a48 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jaxboe@fusionio.com>
Date: Fri, 19 Aug 2011 08:34:48 +0200
Subject: [PATCH 23/23] Revert "cfq: Remove special treatment for metadata
 rqs."

We have a kernel build regression since 3.1-rc1, which is about 10%
regression. The kernel source is in an ext3 filesystem.
Alex Shi bisect it to commit:
commit a07405b7802691d29ab3b23bdc76ee6d006aad0b
Author: Justin TerAvest <teravest@google.com>
Date:   Sun Jul 10 22:09:19 2011 +0200

    cfq: Remove special treatment for metadata rqs.

Apparently this is caused by lack metadata preemption, where ext3/ext4
do use READ_META. I didn't see a way to fix the issue, so suggest
reverting the patch.

This reverts commit a07405b7802691d29ab3b23bdc76ee6d006aad0b.

Reported-by: Alex Shi<alex.shi@intel.com>
Reported-by: Shaohua Li<shaohua.li@intel.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 block/cfq-iosched.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 650834537606..a33bd4377c61 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -130,6 +130,8 @@ struct cfq_queue {
 	unsigned long slice_end;
 	long slice_resid;
 
+	/* pending metadata requests */
+	int meta_pending;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 
@@ -682,6 +684,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
 	if (rq_is_sync(rq1) != rq_is_sync(rq2))
 		return rq_is_sync(rq1) ? rq1 : rq2;
 
+	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
+		return rq1->cmd_flags & REQ_META ? rq1 : rq2;
+
 	s1 = blk_rq_pos(rq1);
 	s2 = blk_rq_pos(rq2);
 
@@ -1607,6 +1612,10 @@ static void cfq_remove_request(struct request *rq)
 	cfqq->cfqd->rq_queued--;
 	cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
 					rq_data_dir(rq), rq_is_sync(rq));
+	if (rq->cmd_flags & REQ_META) {
+		WARN_ON(!cfqq->meta_pending);
+		cfqq->meta_pending--;
+	}
 }
 
 static int cfq_merge(struct request_queue *q, struct request **req,
@@ -3359,6 +3368,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
 		return true;
 
+	/*
+	 * So both queues are sync. Let the new request get disk time if
+	 * it's a metadata request and the current queue is doing regular IO.
+	 */
+	if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
+		return true;
+
 	/*
 	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
 	 */
@@ -3423,6 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_io_context *cic = RQ_CIC(rq);
 
 	cfqd->rq_queued++;
+	if (rq->cmd_flags & REQ_META)
+		cfqq->meta_pending++;
 
 	cfq_update_io_thinktime(cfqd, cfqq, cic);
 	cfq_update_io_seektime(cfqd, cfqq, rq);