s390/dasd: fix unusable device after safe offline processing

The safe offline processing needs, as well as the normal offline
processing, to be locked against multiple parallel executions. But it
should be able to be overtaken by a normal offline processing to make sure
that the device does not wait forever for outstanding I/O if the user
wants to.

Unfortunately the parallel processing of safe offline and normal offline
might lead to a race situation where both threads report successful
execution to the CIO layer which in turn tries to deregister the kobject
of the device twice. This leads to a

refcount_t: underflow; use-after-free.

error and the device is not able to be set online again afterwards without
a reboot.

Correct the locking of the safe offline processing by doing the following:
	- Use the cdev lock to secure all set and test operations to the
	  device flags.
	- Two safe offline processes are locked against each other using
	  the DASD_FLAG_SAFE_OFFLINE and DASD_FLAG_SAFE_OFFLINE_RUNNING
	  device flags.
	  The differentiation between offline triggered and offline running
	  is needed since the normal offline attribute is owned by CIO and
	  we have to pass over control in between.
	- The dasd_generic_set_offline process handles the offline
	  processing. It is locked against parallel execution using the
	  DASD_FLAG_OFFLINE.
	- Only a running safe offline should be able to be overtaken by a
	  single normal offline. This is ensured by clearing the
	  DASD_FLAG_SAFE_OFFLINE_RUNNING flag when a normal offline
	  overtakes. So this can only happen ones.
	- The safe offline just aborts in this case doing nothing and
	  the normal offline processing finishes as usual.

Signed-off-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
Stefan Haberland 2017-05-16 10:30:13 +02:00 committed by Martin Schwidefsky
parent b487a914f8
commit 2757fe1d8e
2 changed files with 44 additions and 31 deletions

View File

@ -3562,57 +3562,69 @@ int dasd_generic_set_offline(struct ccw_device *cdev)
else else
pr_warn("%s: The DASD cannot be set offline while it is in use\n", pr_warn("%s: The DASD cannot be set offline while it is in use\n",
dev_name(&cdev->dev)); dev_name(&cdev->dev));
clear_bit(DASD_FLAG_OFFLINE, &device->flags); rc = -EBUSY;
goto out_busy; goto out_err;
} }
} }
if (test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { /*
/* * Test if the offline processing is already running and exit if so.
* safe offline already running * If a safe offline is being processed this could only be a normal
* could only be called by normal offline so safe_offline flag * offline that should be able to overtake the safe offline and
* needs to be removed to run normal offline and kill all I/O * cancel any I/O we do not want to wait for any longer
*/ */
if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags)) if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) {
/* Already doing normal offline processing */ if (test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
goto out_busy; clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING,
else &device->flags);
clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags); } else {
} else { rc = -EBUSY;
if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) goto out_err;
/* Already doing offline processing */ }
goto out_busy;
} }
set_bit(DASD_FLAG_OFFLINE, &device->flags); set_bit(DASD_FLAG_OFFLINE, &device->flags);
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
/* /*
* if safe_offline called set safe_offline_running flag and * if safe_offline is called set safe_offline_running flag and
* clear safe_offline so that a call to normal offline * clear safe_offline so that a call to normal offline
* can overrun safe_offline processing * can overrun safe_offline processing
*/ */
if (test_and_clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags) && if (test_and_clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags) &&
!test_and_set_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { !test_and_set_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
/* need to unlock here to wait for outstanding I/O */
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
/* /*
* If we want to set the device safe offline all IO operations * If we want to set the device safe offline all IO operations
* should be finished before continuing the offline process * should be finished before continuing the offline process
* so sync bdev first and then wait for our queues to become * so sync bdev first and then wait for our queues to become
* empty * empty
*/ */
/* sync blockdev and partitions */
if (device->block) { if (device->block) {
rc = fsync_bdev(device->block->bdev); rc = fsync_bdev(device->block->bdev);
if (rc != 0) if (rc != 0)
goto interrupted; goto interrupted;
} }
/* schedule device tasklet and wait for completion */
dasd_schedule_device_bh(device); dasd_schedule_device_bh(device);
rc = wait_event_interruptible(shutdown_waitq, rc = wait_event_interruptible(shutdown_waitq,
_wait_for_empty_queues(device)); _wait_for_empty_queues(device));
if (rc != 0) if (rc != 0)
goto interrupted; goto interrupted;
/*
* check if a normal offline process overtook the offline
* processing in this case simply do nothing beside returning
* that we got interrupted
* otherwise mark safe offline as not running any longer and
* continue with normal offline
*/
spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
if (!test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
rc = -ERESTARTSYS;
goto out_err;
}
clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags);
} }
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
dasd_set_target_state(device, DASD_STATE_NEW); dasd_set_target_state(device, DASD_STATE_NEW);
/* dasd_delete_device destroys the device reference. */ /* dasd_delete_device destroys the device reference. */
@ -3624,22 +3636,18 @@ int dasd_generic_set_offline(struct ccw_device *cdev)
*/ */
if (block) if (block)
dasd_free_block(block); dasd_free_block(block);
return 0; return 0;
interrupted: interrupted:
/* interrupted by signal */ /* interrupted by signal */
clear_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags); spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags); clear_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags);
clear_bit(DASD_FLAG_OFFLINE, &device->flags); clear_bit(DASD_FLAG_OFFLINE, &device->flags);
dasd_put_device(device); out_err:
return rc;
out_busy:
dasd_put_device(device); dasd_put_device(device);
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags); spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
return rc;
return -EBUSY;
} }
EXPORT_SYMBOL_GPL(dasd_generic_set_offline); EXPORT_SYMBOL_GPL(dasd_generic_set_offline);

View File

@ -950,11 +950,14 @@ dasd_safe_offline_store(struct device *dev, struct device_attribute *attr,
{ {
struct ccw_device *cdev = to_ccwdev(dev); struct ccw_device *cdev = to_ccwdev(dev);
struct dasd_device *device; struct dasd_device *device;
unsigned long flags;
int rc; int rc;
device = dasd_device_from_cdev(cdev); spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
device = dasd_device_from_cdev_locked(cdev);
if (IS_ERR(device)) { if (IS_ERR(device)) {
rc = PTR_ERR(device); rc = PTR_ERR(device);
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
goto out; goto out;
} }
@ -962,12 +965,14 @@ dasd_safe_offline_store(struct device *dev, struct device_attribute *attr,
test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) { test_bit(DASD_FLAG_SAFE_OFFLINE_RUNNING, &device->flags)) {
/* Already doing offline processing */ /* Already doing offline processing */
dasd_put_device(device); dasd_put_device(device);
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
rc = -EBUSY; rc = -EBUSY;
goto out; goto out;
} }
set_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags); set_bit(DASD_FLAG_SAFE_OFFLINE, &device->flags);
dasd_put_device(device); dasd_put_device(device);
spin_unlock_irqrestore(get_ccwdev_lock(cdev), flags);
rc = ccw_device_set_offline(cdev); rc = ccw_device_set_offline(cdev);