A set of device-mapper fixes for 3.12.

Merge tag 'dm-3.12-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device-mapper fixes from Mike Snitzer:
 "A few fixes for dm-snapshot, a 32-bit fix for dm-stats, and a couple
  of error-handling fixes for dm-multipath.  A fix for the thin
  provisioning target so it does not expose non-zero discard limits
  when discards are disabled.  Lastly, two new DM module parameters
  that allow users to tune the emergency memory reserves that DM
  maintains per device -- this helps fix a long-standing issue for
  dm-multipath.

  The conservative default reserve for request-based dm-multipath
  devices (256) has proven problematic for users with many multipathed
  SCSI devices but relatively little memory.  To responsibly select a
  smaller value, users should use the new nr_bios tracepoint info (via
  commit 75afb352 "block: Add nr_bios to block_rq_remap tracepoint")
  to determine the peak number of bios their workloads create."

* tag 'dm-3.12-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm: add reserved_bio_based_ios module parameter
  dm: add reserved_rq_based_ios module parameter
  dm: lower bio-based mempool reservation
  dm thin: do not expose non-zero discard limits if discards disabled
  dm mpath: disable WRITE SAME if it fails
  dm-snapshot: fix performance degradation due to small hash size
  dm snapshot: workaround for a false positive lockdep warning
  dm stats: fix possible counter corruption on 32-bit systems
  dm mpath: do not fail path on -ENOSPC
commit e93dd910b9
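For orientation before the diffs: the sketch below is a minimal user-space rendering (not kernel code) of the clamping semantics the two new module parameters get, mirroring the __dm_get_reserved_ios() helper added to drivers/md/dm.c further down. C11 atomics stand in for the kernel's ACCESS_ONCE()/cmpxchg(); the constants are taken from the patch.

/*
 * Minimal user-space sketch of the reserve-clamping logic added in
 * drivers/md/dm.c below.  C11 atomics replace the kernel's
 * ACCESS_ONCE()/cmpxchg(); the constants match the patch.
 */
#include <stdatomic.h>
#include <stdio.h>

#define RESERVED_BIO_BASED_IOS		16
#define RESERVED_REQUEST_BASED_IOS	256
#define RESERVED_MAX_IOS		1024

/* Values a user may overwrite at any time via the module parameters. */
static _Atomic unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
static _Atomic unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;

static unsigned __dm_get_reserved_ios(_Atomic unsigned *reserved_ios,
				      unsigned def, unsigned max)
{
	unsigned ios = atomic_load(reserved_ios);
	unsigned modified_ios = 0;

	if (!ios)		/* 0 means "use the default" */
		modified_ios = def;
	else if (ios > max)	/* silently clamp oversized values */
		modified_ios = max;

	if (modified_ios) {
		/* write back the sanitized value; a racing user write wins */
		atomic_compare_exchange_strong(reserved_ios, &ios, modified_ios);
		ios = modified_ios;
	}

	return ios;
}

int main(void)
{
	/* untouched value passes through unchanged */
	printf("bio-based reserve: %u\n",
	       __dm_get_reserved_ios(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS));

	/* oversized user setting is clamped to RESERVED_MAX_IOS */
	atomic_store(&reserved_rq_based_ios, 4096);
	printf("rq-based reserve: %u\n",
	       __dm_get_reserved_ios(&reserved_rq_based_ios,
				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS));
	return 0;
}

On a running system these parameters typically appear under /sys/module/dm_mod/parameters/ and are writable at runtime, per the S_IRUGO | S_IWUSR modes in the dm.c hunk below.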
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
@@ -19,8 +19,6 @@
 #define DM_MSG_PREFIX "io"
 
 #define DM_IO_MAX_REGIONS BITS_PER_LONG
-#define MIN_IOS 16
-#define MIN_BIOS 16
 
 struct dm_io_client {
 	mempool_t *pool;
@@ -50,16 +48,17 @@ static struct kmem_cache *_dm_io_cache;
 struct dm_io_client *dm_io_client_create(void)
 {
 	struct dm_io_client *client;
+	unsigned min_ios = dm_get_reserved_bio_based_ios();
 
 	client = kmalloc(sizeof(*client), GFP_KERNEL);
 	if (!client)
 		return ERR_PTR(-ENOMEM);
 
-	client->pool = mempool_create_slab_pool(MIN_IOS, _dm_io_cache);
+	client->pool = mempool_create_slab_pool(min_ios, _dm_io_cache);
 	if (!client->pool)
 		goto bad;
 
-	client->bios = bioset_create(MIN_BIOS, 0);
+	client->bios = bioset_create(min_ios, 0);
 	if (!client->bios)
 		goto bad;
 
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
@@ -7,6 +7,7 @@
 
 #include <linux/device-mapper.h>
 
+#include "dm.h"
 #include "dm-path-selector.h"
 #include "dm-uevent.h"
 
@@ -116,8 +117,6 @@ struct dm_mpath_io {
 
 typedef int (*action_fn) (struct pgpath *pgpath);
 
-#define MIN_IOS 256	/* Mempool size */
-
 static struct kmem_cache *_mpio_cache;
 
 static struct workqueue_struct *kmultipathd, *kmpath_handlerd;
@@ -190,6 +189,7 @@ static void free_priority_group(struct priority_group *pg,
 static struct multipath *alloc_multipath(struct dm_target *ti)
 {
 	struct multipath *m;
+	unsigned min_ios = dm_get_reserved_rq_based_ios();
 
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
@@ -202,7 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		INIT_WORK(&m->trigger_event, trigger_event);
 		init_waitqueue_head(&m->pg_init_wait);
 		mutex_init(&m->work_mutex);
-		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
+		m->mpio_pool = mempool_create_slab_pool(min_ios, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
 			return NULL;
@@ -1268,6 +1268,7 @@ static int noretry_error(int error)
 	case -EREMOTEIO:
 	case -EILSEQ:
 	case -ENODATA:
+	case -ENOSPC:
 		return 1;
 	}
 
@@ -1298,8 +1299,17 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
 
-	if (noretry_error(error))
+	if (noretry_error(error)) {
+		if ((clone->cmd_flags & REQ_WRITE_SAME) &&
+		    !clone->q->limits.max_write_same_sectors) {
+			struct queue_limits *limits;
+
+			/* device doesn't really support WRITE SAME, disable it */
+			limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
+			limits->max_write_same_sectors = 0;
+		}
 		return error;
+	}
 
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
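The do_end_io() change above follows a common pattern: on completion, if the failed request was a WRITE SAME and the underlying path's queue no longer advertises support, the mapped device's own limit is zeroed so upper layers stop issuing the command. A toy user-space sketch of the idea (all types and names here are illustrative, not the kernel's):

/* Illustrative only: "fail once, then stop advertising the capability". */
#include <stdbool.h>
#include <stdio.h>

struct queue_limits { unsigned max_write_same_sectors; };

struct device {
	struct queue_limits limits;	/* what we advertise upward */
	bool backend_supports_ws;	/* what the backend really does */
};

/* complete one WRITE SAME; returns 0 on success, -1 on failure */
static int complete_write_same(struct device *dev)
{
	if (dev->backend_supports_ws)
		return 0;

	/* device doesn't really support WRITE SAME, disable it */
	dev->limits.max_write_same_sectors = 0;
	return -1;
}

int main(void)
{
	struct device dev = { .limits = { 1024 }, .backend_supports_ws = false };

	complete_write_same(&dev);	/* first failure flips the limit off */
	printf("max_write_same_sectors = %u\n", dev.limits.max_write_same_sectors);
	return 0;
}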
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
@@ -256,7 +256,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
 	 */
 	INIT_WORK_ONSTACK(&req.work, do_metadata);
 	queue_work(ps->metadata_wq, &req.work);
-	flush_work(&req.work);
+	flush_workqueue(ps->metadata_wq);
 
 	return req.result;
 }
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
@@ -725,17 +725,16 @@ static int calc_max_buckets(void)
  */
 static int init_hash_tables(struct dm_snapshot *s)
 {
-	sector_t hash_size, cow_dev_size, origin_dev_size, max_buckets;
+	sector_t hash_size, cow_dev_size, max_buckets;
 
 	/*
 	 * Calculate based on the size of the original volume or
 	 * the COW volume...
 	 */
 	cow_dev_size = get_dev_size(s->cow->bdev);
-	origin_dev_size = get_dev_size(s->origin->bdev);
 	max_buckets = calc_max_buckets();
 
-	hash_size = min(origin_dev_size, cow_dev_size) >> s->store->chunk_shift;
+	hash_size = cow_dev_size >> s->store->chunk_shift;
 	hash_size = min(hash_size, max_buckets);
 
 	if (hash_size < 64)
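The snapshot hunk above sizes the exception hash table from the COW volume alone, rather than min(origin, cow): the COW device is what actually bounds the number of exceptions, and an origin that reports a tiny (or zero) size at snapshot load time no longer collapses the table to its 64-bucket floor. A quick arithmetic sketch of the sizing rule, with illustrative values (the device sizes and shift are examples, not from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long long cow_dev_size = 1ULL << 31;	/* 1 TiB in 512B sectors */
	unsigned chunk_shift = 3;			/* 4 KiB chunks */
	unsigned long long max_buckets = 1ULL << 21;	/* stand-in for calc_max_buckets() */

	unsigned long long hash_size = cow_dev_size >> chunk_shift;
	if (hash_size > max_buckets)
		hash_size = max_buckets;
	if (hash_size < 64)
		hash_size = 64;	/* lower bound, as in the hunk above */

	printf("hash buckets: %llu\n", hash_size);
	return 0;
}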
diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
@@ -451,19 +451,26 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 	struct dm_stat_percpu *p;
 
 	/*
-	 * For strict correctness we should use local_irq_disable/enable
+	 * For strict correctness we should use local_irq_save/restore
 	 * instead of preempt_disable/enable.
 	 *
-	 * This is racy if the driver finishes bios from non-interrupt
-	 * context as well as from interrupt context or from more different
-	 * interrupts.
+	 * preempt_disable/enable is racy if the driver finishes bios
+	 * from non-interrupt context as well as from interrupt context
+	 * or from more different interrupts.
 	 *
-	 * However, the race only results in not counting some events,
-	 * so it is acceptable.
+	 * On 64-bit architectures the race only results in not counting some
+	 * events, so it is acceptable.  On 32-bit architectures the race could
+	 * cause the counter going off by 2^32, so we need to do proper locking
+	 * there.
 	 *
 	 * part_stat_lock()/part_stat_unlock() have this race too.
 	 */
+#if BITS_PER_LONG == 32
+	unsigned long flags;
+	local_irq_save(flags);
+#else
 	preempt_disable();
+#endif
 	p = &s->stat_percpu[smp_processor_id()][entry];
 
 	if (!end) {
@@ -478,7 +485,11 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 		p->ticks[idx] += duration;
 	}
 
+#if BITS_PER_LONG == 32
+	local_irq_restore(flags);
+#else
 	preempt_enable();
+#endif
 }
 
 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
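The comment added above is the heart of the fix: on 32-bit machines a 64-bit counter update compiles to two word-sized operations, so an interrupt that also updates the counter can land between them and leave the total off by 2^32. A user-space illustration of the torn update, with the two word operations written out explicitly (simplified; in real code the split is compiler-generated):

#include <stdint.h>
#include <stdio.h>

static uint64_t counter;

static void increment(void)
{
	/* what "counter++" amounts to on a 32-bit CPU, made explicit */
	uint32_t lo = (uint32_t)counter;
	uint32_t hi = (uint32_t)(counter >> 32);

	lo += 1;
	if (lo == 0)
		hi += 1;	/* an interrupt firing here sees a torn value */

	counter = ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	counter = 0xffffffffULL;	/* one step before the carry */
	increment();
	printf("counter = %#llx\n", (unsigned long long)counter);
	return 0;
}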
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
@@ -2095,6 +2095,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 * them down to the data device. The thin device's discard
 	 * processing will cause mappings to be removed from the btree.
 	 */
+	ti->discard_zeroes_data_unsupported = true;
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_bios = 1;
 
@@ -2104,7 +2105,6 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		 * thin devices' discard limits consistent).
 		 */
 		ti->discards_supported = true;
-		ti->discard_zeroes_data_unsupported = true;
 	}
 	ti->private = pt;
 
@@ -2689,8 +2689,16 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	 * They get transferred to the live pool in bind_control_target()
 	 * called from pool_preresume().
 	 */
-	if (!pt->adjusted_pf.discard_enabled)
+	if (!pt->adjusted_pf.discard_enabled) {
+		/*
+		 * Must explicitly disallow stacking discard limits otherwise the
+		 * block layer will stack them if pool's data device has support.
+		 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
+		 * user to see that, so make sure to set all discard limits to 0.
+		 */
+		limits->discard_granularity = 0;
 		return;
+	}
 
 	disable_passdown_if_not_supported(pt);
 
@@ -2826,10 +2834,10 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
 
 	/* In case the pool supports discards, pass them on. */
+	ti->discard_zeroes_data_unsupported = true;
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		ti->discard_zeroes_data_unsupported = true;
 		/* Discard bios must be split on a block boundary */
 		ti->split_discard_bios = true;
 	}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
@@ -211,10 +211,55 @@ struct dm_md_mempools {
 	struct bio_set *bs;
 };
 
-#define MIN_IOS 256
+#define RESERVED_BIO_BASED_IOS		16
+#define RESERVED_REQUEST_BASED_IOS	256
+#define RESERVED_MAX_IOS		1024
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_rq_tio_cache;
 
+/*
+ * Bio-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
+
+/*
+ * Request-based DM's mempools' reserved IOs set by the user.
+ */
+static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+
+static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+				      unsigned def, unsigned max)
+{
+	unsigned ios = ACCESS_ONCE(*reserved_ios);
+	unsigned modified_ios = 0;
+
+	if (!ios)
+		modified_ios = def;
+	else if (ios > max)
+		modified_ios = max;
+
+	if (modified_ios) {
+		(void)cmpxchg(reserved_ios, ios, modified_ios);
+		ios = modified_ios;
+	}
+
+	return ios;
+}
+
+unsigned dm_get_reserved_bio_based_ios(void)
+{
+	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
+
+unsigned dm_get_reserved_rq_based_ios(void)
+{
+	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
+}
+EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -2277,6 +2322,17 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 	return md->immutable_target_type;
 }
 
+/*
+ * The queue_limits are only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
+{
+	BUG_ON(!atomic_read(&md->holders));
+	return &md->queue->limits;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue_limits);
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
@@ -2862,18 +2918,18 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 
 	if (type == DM_TYPE_BIO_BASED) {
 		cachep = _io_cache;
-		pool_size = 16;
+		pool_size = dm_get_reserved_bio_based_ios();
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 	} else if (type == DM_TYPE_REQUEST_BASED) {
 		cachep = _rq_tio_cache;
-		pool_size = MIN_IOS;
+		pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
 		/* per_bio_data_size is not used. See __bind_mempools(). */
 		WARN_ON(per_bio_data_size != 0);
 	} else
 		goto out;
 
-	pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep);
+	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
 	if (!pools->io_pool)
 		goto out;
 
@@ -2924,6 +2980,13 @@ module_exit(dm_exit);
 
 module_param(major, uint, 0);
 MODULE_PARM_DESC(major, "The major number of the device mapper");
+
+module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
+
+module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
@@ -184,6 +184,9 @@ void dm_free_md_mempools(struct dm_md_mempools *pools);
 /*
  * Helpers that are used by DM core
  */
+unsigned dm_get_reserved_bio_based_ios(void);
+unsigned dm_get_reserved_rq_based_ios(void);
+
 static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen)
 {
 	return !maxlen || strlen(result) + 1 >= maxlen;
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
@@ -406,13 +406,14 @@ int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
 union map_info *dm_get_rq_mapinfo(struct request *rq);
 
+struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
+
 /*
  * Geometry functions.
  */
 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo);
 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo);
 
-
 /*-----------------------------------------------------------------
  * Functions for manipulating device-mapper tables.
  *---------------------------------------------------------------*/