 - Revert a request-based DM core change that caused IO latency to
   increase and adversely impact both throughput and system load
 
 - Fix for a use after free bug in DM core's device cleanup
 
 - A couple DM btree removal fixes (used by dm-thinp)
 
 - A DM thinp fix for order-5 allocation failure
 
 - A DM thinp fix to not degrade to read-only metadata mode when in
   out-of-data-space mode for longer than the 'no_space_timeout'
 
 - Fix a long-standing oversight in both dm-thinp and dm-cache by
   now exporting 'needs_check' in status if it was set in metadata
 
 - Fix an embarrassing dm-cache busy-loop that caused worker threads to
   eat cpu even if no IO was actively being issued to the cache device
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQEcBAABAgAGBQJVqUfSAAoJEMUj8QotnQNa+RUH+wXrHCGI6J7RHIXVd5igP9K0
 yFZGEnLZe6Ebt5CACLcKn/qN0g97iwCrlcxFt+1Gj/GbW1GIQzs7vg38La3PZxWZ
 jAkI3JMY816bP1x3VK1HtMsk2gRaE/hh0gxK5pPLB9a+ZdEsz9UML0rs+JseOdn3
 n+454dhwOyChwz7zFEbpn+mfjoruFScGX0Y2qaSHBV/xMhmExpthw9V1yFC2v2tW
 8cAHOMDLNLHhR5adF9YxjZH8wILbyYK9oPy3iGhj/TF/Dx7saWYG4UlnL5xIOLsB
 5WK9gRrJJ/Wf0FsDdN88AaY4Bdpj4esS2JeTZpvujxeBb7ZNeJoCUqyzggURv/c=
 =hCjo
 -----END PGP SIGNATURE-----

Merge tag 'dm-4.2-fixes-2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:

 - revert a request-based DM core change that caused IO latency to
   increase and adversely impact both throughput and system load

 - fix for a use after free bug in DM core's device cleanup

 - a couple DM btree removal fixes (used by dm-thinp)

 - a DM thinp fix for order-5 allocation failure

 - a DM thinp fix to not degrade to read-only metadata mode when in
   out-of-data-space mode for longer than the 'no_space_timeout'

 - fix a long-standing oversight in both dm-thinp and dm-cache by now
   exporting 'needs_check' in status if it was set in metadata

 - fix an embarrassing dm-cache busy-loop that caused worker threads to
   eat cpu even if no IO was actively being issued to the cache device

* tag 'dm-4.2-fixes-2' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm cache: avoid calls to prealloc_free_structs() if possible
  dm cache: avoid preallocation if no work in writeback_some_dirty_blocks()
  dm cache: do not wake_worker() in free_migration()
  dm cache: display 'needs_check' in status if it is set
  dm thin: display 'needs_check' in status if it is set
  dm thin: stay in out-of-data-space mode once no_space_timeout expires
  dm: fix use after free crash due to incorrect cleanup sequence
  Revert "dm: only run the queue on completion if congested or no requests pending"
  dm btree: silence lockdep lock inversion in dm_btree_del()
  dm thin: allocate the cell_sort_array dynamically
  dm btree remove: fix bug in redistribute3
This commit is contained in:
Linus Torvalds 2015-07-17 20:53:57 -07:00
commit 3f8476fe89
7 changed files with 82 additions and 35 deletions

View File

@ -258,6 +258,12 @@ cache metadata mode : ro if read-only, rw if read-write
no further I/O will be permitted and the status will just
contain the string 'Fail'. The userspace recovery tools
should then be used.
needs_check : 'needs_check' if set, '-' if not set
A metadata operation has failed, resulting in the needs_check
flag being set in the metadata's superblock. The metadata
device must be deactivated and checked/repaired before the
cache can be made fully operational again. '-' indicates
needs_check is not set.
Messages
--------

View File

@ -296,7 +296,7 @@ ii) Status
underlying device. When this is enabled when loading the table,
it can get disabled if the underlying device doesn't support it.
ro|rw
ro|rw|out_of_data_space
If the pool encounters certain types of device failures it will
drop into a read-only metadata mode in which no changes to
the pool metadata (like allocating new blocks) are permitted.
@ -314,6 +314,13 @@ ii) Status
module parameter can be used to change this timeout -- it
defaults to 60 seconds but may be disabled using a value of 0.
needs_check
A metadata operation has failed, resulting in the needs_check
flag being set in the metadata's superblock. The metadata
device must be deactivated and checked/repaired before the
thin-pool can be made fully operational again. '-' indicates
needs_check is not set.
iii) Messages
create_thin <dev id>

View File

@ -424,7 +424,6 @@ static void free_migration(struct dm_cache_migration *mg)
wake_up(&cache->migration_wait);
mempool_free(mg, cache->migration_pool);
wake_worker(cache);
}
static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
@ -1947,6 +1946,7 @@ static int commit_if_needed(struct cache *cache)
static void process_deferred_bios(struct cache *cache)
{
bool prealloc_used = false;
unsigned long flags;
struct bio_list bios;
struct bio *bio;
@ -1981,13 +1981,16 @@ static void process_deferred_bios(struct cache *cache)
process_discard_bio(cache, &structs, bio);
else
process_bio(cache, &structs, bio);
prealloc_used = true;
}
prealloc_free_structs(cache, &structs);
if (prealloc_used)
prealloc_free_structs(cache, &structs);
}
static void process_deferred_cells(struct cache *cache)
{
bool prealloc_used = false;
unsigned long flags;
struct dm_bio_prison_cell *cell, *tmp;
struct list_head cells;
@ -2015,9 +2018,11 @@ static void process_deferred_cells(struct cache *cache)
}
process_cell(cache, &structs, cell);
prealloc_used = true;
}
prealloc_free_structs(cache, &structs);
if (prealloc_used)
prealloc_free_structs(cache, &structs);
}
static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
@ -2062,7 +2067,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
static void writeback_some_dirty_blocks(struct cache *cache)
{
int r = 0;
bool prealloc_used = false;
dm_oblock_t oblock;
dm_cblock_t cblock;
struct prealloc structs;
@ -2072,23 +2077,21 @@ static void writeback_some_dirty_blocks(struct cache *cache)
memset(&structs, 0, sizeof(structs));
while (spare_migration_bandwidth(cache)) {
if (prealloc_data_structs(cache, &structs))
break;
if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
break; /* no work to do */
r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
if (r)
break;
r = get_cell(cache, oblock, &structs, &old_ocell);
if (r) {
if (prealloc_data_structs(cache, &structs) ||
get_cell(cache, oblock, &structs, &old_ocell)) {
policy_set_dirty(cache->policy, oblock);
break;
}
writeback(cache, &structs, oblock, cblock, old_ocell);
prealloc_used = true;
}
prealloc_free_structs(cache, &structs);
if (prealloc_used)
prealloc_free_structs(cache, &structs);
}
/*----------------------------------------------------------------
@ -3496,7 +3499,7 @@ static void cache_resume(struct dm_target *ti)
* <#demotions> <#promotions> <#dirty>
* <#features> <features>*
* <#core args> <core args>
* <policy name> <#policy args> <policy args>* <cache metadata mode>
* <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
*/
static void cache_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@ -3582,6 +3585,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("rw ");
if (dm_cache_metadata_needs_check(cache->cmd))
DMEMIT("needs_check ");
else
DMEMIT("- ");
break;
case STATUSTYPE_TABLE:
@ -3820,7 +3828,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type cache_target = {
.name = "cache",
.version = {1, 7, 0},
.version = {1, 8, 0},
.module = THIS_MODULE,
.ctr = cache_ctr,
.dtr = cache_dtr,

View File

@ -18,6 +18,7 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/sort.h>
#include <linux/rbtree.h>
@ -268,7 +269,7 @@ struct pool {
process_mapping_fn process_prepared_mapping;
process_mapping_fn process_prepared_discard;
struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
struct dm_bio_prison_cell **cell_sort_array;
};
static enum pool_mode get_pool_mode(struct pool *pool);
@ -2281,18 +2282,23 @@ static void do_waker(struct work_struct *ws)
queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
}
static void notify_of_pool_mode_change_to_oods(struct pool *pool);
/*
* We're holding onto IO to allow userland time to react. After the
* timeout either the pool will have been resized (and thus back in
* PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
* PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
*/
static void do_no_space_timeout(struct work_struct *ws)
{
struct pool *pool = container_of(to_delayed_work(ws), struct pool,
no_space_timeout);
if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
set_pool_mode(pool, PM_READ_ONLY);
if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
pool->pf.error_if_no_space = true;
notify_of_pool_mode_change_to_oods(pool);
error_retry_list(pool);
}
}
/*----------------------------------------------------------------*/
@ -2370,6 +2376,14 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
dm_device_name(pool->pool_md), new_mode);
}
static void notify_of_pool_mode_change_to_oods(struct pool *pool)
{
if (!pool->pf.error_if_no_space)
notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
else
notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
}
static bool passdown_enabled(struct pool_c *pt)
{
return pt->adjusted_pf.discard_passdown;
@ -2454,7 +2468,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
* frequently seeing this mode.
*/
if (old_mode != new_mode)
notify_of_pool_mode_change(pool, "out-of-data-space");
notify_of_pool_mode_change_to_oods(pool);
pool->process_bio = process_bio_read_only;
pool->process_discard = process_discard_bio;
pool->process_cell = process_cell_read_only;
@ -2777,6 +2791,7 @@ static void __pool_destroy(struct pool *pool)
{
__pool_table_remove(pool);
vfree(pool->cell_sort_array);
if (dm_pool_metadata_close(pool->pmd) < 0)
DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
@ -2889,6 +2904,13 @@ static struct pool *pool_create(struct mapped_device *pool_md,
goto bad_mapping_pool;
}
pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE);
if (!pool->cell_sort_array) {
*error = "Error allocating cell sort array";
err_p = ERR_PTR(-ENOMEM);
goto bad_sort_array;
}
pool->ref_count = 1;
pool->last_commit_jiffies = jiffies;
pool->pool_md = pool_md;
@ -2897,6 +2919,8 @@ static struct pool *pool_create(struct mapped_device *pool_md,
return pool;
bad_sort_array:
mempool_destroy(pool->mapping_pool);
bad_mapping_pool:
dm_deferred_set_destroy(pool->all_io_ds);
bad_all_io_ds:
@ -3714,6 +3738,7 @@ static void emit_flags(struct pool_features *pf, char *result,
* Status line is:
* <transaction id> <used metadata sectors>/<total metadata sectors>
* <used data sectors>/<total data sectors> <held metadata root>
* <pool mode> <discard config> <no space config> <needs_check>
*/
static void pool_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
@ -3815,6 +3840,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("queue_if_no_space ");
if (dm_pool_metadata_needs_check(pool->pmd))
DMEMIT("needs_check ");
else
DMEMIT("- ");
break;
case STATUSTYPE_TABLE:
@ -3918,7 +3948,7 @@ static struct target_type pool_target = {
.name = "thin-pool",
.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
DM_TARGET_IMMUTABLE,
.version = {1, 15, 0},
.version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = pool_ctr,
.dtr = pool_dtr,
@ -4305,7 +4335,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
static struct target_type thin_target = {
.name = "thin",
.version = {1, 15, 0},
.version = {1, 16, 0},
.module = THIS_MODULE,
.ctr = thin_ctr,
.dtr = thin_dtr,

View File

@ -1067,13 +1067,10 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
*/
static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
{
int nr_requests_pending;
atomic_dec(&md->pending[rw]);
/* nudge anyone waiting on suspend queue */
nr_requests_pending = md_in_flight(md);
if (!nr_requests_pending)
if (!md_in_flight(md))
wake_up(&md->wait);
/*
@ -1085,8 +1082,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
if (run_queue) {
if (md->queue->mq_ops)
blk_mq_run_hw_queues(md->queue, true);
else if (!nr_requests_pending ||
(nr_requests_pending >= md->queue->nr_congestion_on))
else
blk_run_queue_async(md->queue);
}
@ -2281,8 +2277,6 @@ static void dm_init_old_md_queue(struct mapped_device *md)
static void cleanup_mapped_device(struct mapped_device *md)
{
cleanup_srcu_struct(&md->io_barrier);
if (md->wq)
destroy_workqueue(md->wq);
if (md->kworker_task)
@ -2294,6 +2288,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
if (md->bs)
bioset_free(md->bs);
cleanup_srcu_struct(&md->io_barrier);
if (md->disk) {
spin_lock(&_minor_lock);
md->disk->private_data = NULL;

View File

@ -309,8 +309,8 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
if (s < 0 && nr_center < -s) {
/* not enough in central node */
shift(left, center, nr_center);
s = nr_center - target;
shift(left, center, -nr_center);
s += nr_center;
shift(left, right, s);
nr_right += s;
} else
@ -323,7 +323,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
if (s > 0 && nr_center < s) {
/* not enough in central node */
shift(center, right, nr_center);
s = target - nr_center;
s -= nr_center;
shift(left, right, s);
nr_left -= s;
} else

View File

@ -255,7 +255,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
int r;
struct del_stack *s;
s = kmalloc(sizeof(*s), GFP_KERNEL);
s = kmalloc(sizeof(*s), GFP_NOIO);
if (!s)
return -ENOMEM;
s->info = info;