[SCSI] mpt2sas: Better handling of DEAD IOC (PCI-E Link down) error condition

Detection of a dead IOC is done in the fault_reset_work thread.

If the IOC Doorbell is 0xFFFFFFFF, the IOC is detected as a non-operational/DEAD IOC.
When a DEAD IOC is detected, that IOC and all its attached devices are
removed from the OS.
The PCI layer API pci_remove_bus_device() is called to remove the dead IOC.

Signed-off-by: Nagalakshmi Nandigama <nagalakshmi.nandigama@lsi.com>
Signed-off-by: James Bottomley <JBottomley@Parallels.com>
This commit is contained in:
nagalakshmi.nandigama@lsi.com 2011-12-01 07:42:04 +05:30 committed by James Bottomley
parent 4053a4be52
commit 845a0e40af
3 changed files with 63 additions and 0 deletions

View File

@ -57,6 +57,7 @@
#include <linux/sort.h> #include <linux/sort.h>
#include <linux/io.h> #include <linux/io.h>
#include <linux/time.h> #include <linux/time.h>
#include <linux/kthread.h>
#include <linux/aer.h> #include <linux/aer.h>
#include "mpt2sas_base.h" #include "mpt2sas_base.h"
@ -120,9 +121,33 @@ _scsih_set_fwfault_debug(const char *val, struct kernel_param *kp)
ioc->fwfault_debug = mpt2sas_fwfault_debug; ioc->fwfault_debug = mpt2sas_fwfault_debug;
return 0; return 0;
} }
module_param_call(mpt2sas_fwfault_debug, _scsih_set_fwfault_debug, module_param_call(mpt2sas_fwfault_debug, _scsih_set_fwfault_debug,
param_get_int, &mpt2sas_fwfault_debug, 0644); param_get_int, &mpt2sas_fwfault_debug, 0644);
/**
 * mpt2sas_remove_dead_ioc_func - kthread entry point that removes a dead ioc
 * @arg: opaque thread argument, interpreted as the struct MPT2SAS_ADAPTER
 *       whose pci device is to be removed
 *
 * Return 0 once pci_remove_bus_device() has been called for the adapter's
 * pci device.
 * Return -1 when @arg is NULL or the adapter has no pci device attached.
 */
static int mpt2sas_remove_dead_ioc_func(void *arg)
{
	struct MPT2SAS_ADAPTER *ioc = arg;
	/* Read the pdev pointer exactly once; NULL covers both failure cases. */
	struct pci_dev *dead_dev = ioc ? ioc->pdev : NULL;

	if (!dead_dev)
		return -1;

	pci_remove_bus_device(dead_dev);
	return 0;
}
/** /**
* _base_fault_reset_work - workq handling ioc fault conditions * _base_fault_reset_work - workq handling ioc fault conditions
* @work: input argument, used to derive ioc * @work: input argument, used to derive ioc
@ -138,6 +163,7 @@ _base_fault_reset_work(struct work_struct *work)
unsigned long flags; unsigned long flags;
u32 doorbell; u32 doorbell;
int rc; int rc;
struct task_struct *p;
spin_lock_irqsave(&ioc->ioc_reset_in_progress_lock, flags); spin_lock_irqsave(&ioc->ioc_reset_in_progress_lock, flags);
if (ioc->shost_recovery) if (ioc->shost_recovery)
@ -145,6 +171,39 @@ _base_fault_reset_work(struct work_struct *work)
spin_unlock_irqrestore(&ioc->ioc_reset_in_progress_lock, flags); spin_unlock_irqrestore(&ioc->ioc_reset_in_progress_lock, flags);
doorbell = mpt2sas_base_get_iocstate(ioc, 0); doorbell = mpt2sas_base_get_iocstate(ioc, 0);
if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_MASK) {
printk(MPT2SAS_INFO_FMT "%s : SAS host is non-operational !!!!\n",
ioc->name, __func__);
/*
* Call the schedule_dead_ioc_flush_running_cmds callback
* (_scsih_flush_running_cmds) so that we flush all pending
* commands back to the OS. This call is required to avoid a
* deadlock at the block layer. A dead IOC will fail to do a diag
* reset, and this call is safe since a dead IOC will never return
* any command back from HW.
*/
ioc->schedule_dead_ioc_flush_running_cmds(ioc);
/*
* Set remove_host flag early since kernel thread will
* take some time to execute.
*/
ioc->remove_host = 1;
/*Remove the Dead Host */
p = kthread_run(mpt2sas_remove_dead_ioc_func, ioc,
"mpt2sas_dead_ioc_%d", ioc->id);
if (IS_ERR(p)) {
printk(MPT2SAS_ERR_FMT
"%s: Running mpt2sas_dead_ioc thread failed !!!!\n",
ioc->name, __func__);
} else {
printk(MPT2SAS_ERR_FMT
"%s: Running mpt2sas_dead_ioc thread success !!!!\n",
ioc->name, __func__);
}
return; /* don't rearm timer */
}
if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) { if ((doorbell & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
rc = mpt2sas_base_hard_reset_handler(ioc, CAN_SLEEP, rc = mpt2sas_base_hard_reset_handler(ioc, CAN_SLEEP,
FORCE_BIG_HAMMER); FORCE_BIG_HAMMER);

View File

@ -623,6 +623,7 @@ enum mutex_type {
TM_MUTEX_ON = 1, TM_MUTEX_ON = 1,
}; };
typedef void (*MPT2SAS_FLUSH_RUNNING_CMDS)(struct MPT2SAS_ADAPTER *ioc);
/** /**
* struct MPT2SAS_ADAPTER - per adapter struct * struct MPT2SAS_ADAPTER - per adapter struct
* @list: ioc_list * @list: ioc_list
@ -665,6 +666,7 @@ enum mutex_type {
* @msix_vector_count: number msix vectors * @msix_vector_count: number msix vectors
* @cpu_msix_table: table for mapping cpus to msix index * @cpu_msix_table: table for mapping cpus to msix index
* @cpu_msix_table_sz: table size * @cpu_msix_table_sz: table size
* @schedule_dead_ioc_flush_running_cmds: callback to flush pending commands
* @scsi_io_cb_idx: shost generated commands * @scsi_io_cb_idx: shost generated commands
* @tm_cb_idx: task management commands * @tm_cb_idx: task management commands
* @scsih_cb_idx: scsih internal commands * @scsih_cb_idx: scsih internal commands
@ -816,6 +818,7 @@ struct MPT2SAS_ADAPTER {
resource_size_t **reply_post_host_index; resource_size_t **reply_post_host_index;
u16 cpu_msix_table_sz; u16 cpu_msix_table_sz;
u32 ioc_reset_count; u32 ioc_reset_count;
MPT2SAS_FLUSH_RUNNING_CMDS schedule_dead_ioc_flush_running_cmds;
/* internal commands, callback index */ /* internal commands, callback index */
u8 scsi_io_cb_idx; u8 scsi_io_cb_idx;

View File

@ -7928,6 +7928,7 @@ _scsih_probe(struct pci_dev *pdev, const struct pci_device_id *id)
ioc->tm_tr_volume_cb_idx = tm_tr_volume_cb_idx; ioc->tm_tr_volume_cb_idx = tm_tr_volume_cb_idx;
ioc->tm_sas_control_cb_idx = tm_sas_control_cb_idx; ioc->tm_sas_control_cb_idx = tm_sas_control_cb_idx;
ioc->logging_level = logging_level; ioc->logging_level = logging_level;
ioc->schedule_dead_ioc_flush_running_cmds = &_scsih_flush_running_cmds;
/* misc semaphores and spin locks */ /* misc semaphores and spin locks */
mutex_init(&ioc->reset_in_progress_mutex); mutex_init(&ioc->reset_in_progress_mutex);
spin_lock_init(&ioc->ioc_reset_in_progress_lock); spin_lock_init(&ioc->ioc_reset_in_progress_lock);