linux/drivers/scsi/libata-eh.c
Tejun Heo f3e81b19aa [PATCH] libata-eh: implement ata_eh_info and ata_eh_context
struct ata_eh_info serves as the communication channel between
execution path and EH.  Execution path describes detected error
condition in ap->eh_info and EH recovers the port using it.  To avoid
missing error conditions detected during EH, EH makes its own copy of
eh_info and clears it on entry allowing error info to accumulate
during EH.

Most EH states including EH's copy of eh_info are stored in
ap->eh_context (struct ata_eh_context) which is owned by EH and thus
doesn't require any synchronization to access and alter.  This
standardized context makes it easy to integrate various parts of EH
and extend EH to handle multiple links (for PM).

Signed-off-by: Tejun Heo <htejun@gmail.com>
2006-05-15 20:58:21 +09:00

629 lines
15 KiB
C

/*
* libata-eh.c - libata error handling
*
* Maintained by: Jeff Garzik <jgarzik@pobox.com>
* Please ALWAYS copy linux-ide@vger.kernel.org
* on emails.
*
* Copyright 2006 Tejun Heo <htejun@gmail.com>
*
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; see the file COPYING. If not, write to
* the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
* USA.
*
*
* libata documentation is available via 'make {ps|pdf}docs',
* as Documentation/DocBook/libata.*
*
* Hardware documentation available from http://www.t13.org/ and
* http://www.sata-io.org/
*
*/
#include <linux/config.h>
#include <linux/kernel.h>
#include <scsi/scsi.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_eh.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h>
#include <linux/libata.h>
#include "libata.h"
static void __ata_port_freeze(struct ata_port *ap);
static void ata_ering_record(struct ata_ering *ering, int is_io,
unsigned int err_mask)
{
struct ata_ering_entry *ent;
WARN_ON(!err_mask);
ering->cursor++;
ering->cursor %= ATA_ERING_SIZE;
ent = &ering->ring[ering->cursor];
ent->is_io = is_io;
ent->err_mask = err_mask;
ent->timestamp = get_jiffies_64();
}
static struct ata_ering_entry * ata_ering_top(struct ata_ering *ering)
{
struct ata_ering_entry *ent = &ering->ring[ering->cursor];
if (!ent->err_mask)
return NULL;
return ent;
}
static int ata_ering_map(struct ata_ering *ering,
int (*map_fn)(struct ata_ering_entry *, void *),
void *arg)
{
int idx, rc = 0;
struct ata_ering_entry *ent;
idx = ering->cursor;
do {
ent = &ering->ring[idx];
if (!ent->err_mask)
break;
rc = map_fn(ent, arg);
if (rc)
break;
idx = (idx - 1 + ATA_ERING_SIZE) % ATA_ERING_SIZE;
} while (idx != ering->cursor);
return rc;
}
/**
* ata_scsi_timed_out - SCSI layer time out callback
* @cmd: timed out SCSI command
*
* Handles SCSI layer timeout. We race with normal completion of
* the qc for @cmd. If the qc is already gone, we lose and let
* the scsi command finish (EH_HANDLED). Otherwise, the qc has
* timed out and EH should be invoked. Prevent ata_qc_complete()
* from finishing it by setting EH_SCHEDULED and return
* EH_NOT_HANDLED.
*
* TODO: kill this function once old EH is gone.
*
* LOCKING:
* Called from timer context
*
* RETURNS:
* EH_HANDLED or EH_NOT_HANDLED
*/
enum scsi_eh_timer_return ata_scsi_timed_out(struct scsi_cmnd *cmd)
{
struct Scsi_Host *host = cmd->device->host;
struct ata_port *ap = ata_shost_to_port(host);
unsigned long flags;
struct ata_queued_cmd *qc;
enum scsi_eh_timer_return ret;
DPRINTK("ENTER\n");
if (ap->ops->error_handler) {
ret = EH_NOT_HANDLED;
goto out;
}
ret = EH_HANDLED;
spin_lock_irqsave(&ap->host_set->lock, flags);
qc = ata_qc_from_tag(ap, ap->active_tag);
if (qc) {
WARN_ON(qc->scsicmd != cmd);
qc->flags |= ATA_QCFLAG_EH_SCHEDULED;
qc->err_mask |= AC_ERR_TIMEOUT;
ret = EH_NOT_HANDLED;
}
spin_unlock_irqrestore(&ap->host_set->lock, flags);
out:
DPRINTK("EXIT, ret=%d\n", ret);
return ret;
}
/**
* ata_scsi_error - SCSI layer error handler callback
* @host: SCSI host on which error occurred
*
* Handles SCSI-layer-thrown error events.
*
* LOCKING:
* Inherited from SCSI layer (none, can sleep)
*
* RETURNS:
* Zero.
*/
void ata_scsi_error(struct Scsi_Host *host)
{
struct ata_port *ap = ata_shost_to_port(host);
spinlock_t *hs_lock = &ap->host_set->lock;
int i, repeat_cnt = ATA_EH_MAX_REPEAT;
unsigned long flags;
DPRINTK("ENTER\n");
/* synchronize with port task */
ata_port_flush_task(ap);
/* synchronize with host_set lock and sort out timeouts */
/* For new EH, all qcs are finished in one of three ways -
* normal completion, error completion, and SCSI timeout.
* Both cmpletions can race against SCSI timeout. When normal
* completion wins, the qc never reaches EH. When error
* completion wins, the qc has ATA_QCFLAG_FAILED set.
*
* When SCSI timeout wins, things are a bit more complex.
* Normal or error completion can occur after the timeout but
* before this point. In such cases, both types of
* completions are honored. A scmd is determined to have
* timed out iff its associated qc is active and not failed.
*/
if (ap->ops->error_handler) {
struct scsi_cmnd *scmd, *tmp;
int nr_timedout = 0;
spin_lock_irqsave(hs_lock, flags);
list_for_each_entry_safe(scmd, tmp, &host->eh_cmd_q, eh_entry) {
struct ata_queued_cmd *qc;
for (i = 0; i < ATA_MAX_QUEUE; i++) {
qc = __ata_qc_from_tag(ap, i);
if (qc->flags & ATA_QCFLAG_ACTIVE &&
qc->scsicmd == scmd)
break;
}
if (i < ATA_MAX_QUEUE) {
/* the scmd has an associated qc */
if (!(qc->flags & ATA_QCFLAG_FAILED)) {
/* which hasn't failed yet, timeout */
qc->err_mask |= AC_ERR_TIMEOUT;
qc->flags |= ATA_QCFLAG_FAILED;
nr_timedout++;
}
} else {
/* Normal completion occurred after
* SCSI timeout but before this point.
* Successfully complete it.
*/
scmd->retries = scmd->allowed;
scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
}
}
/* If we have timed out qcs. They belong to EH from
* this point but the state of the controller is
* unknown. Freeze the port to make sure the IRQ
* handler doesn't diddle with those qcs. This must
* be done atomically w.r.t. setting QCFLAG_FAILED.
*/
if (nr_timedout)
__ata_port_freeze(ap);
spin_unlock_irqrestore(hs_lock, flags);
} else
spin_unlock_wait(hs_lock);
repeat:
/* invoke error handler */
if (ap->ops->error_handler) {
/* fetch & clear EH info */
spin_lock_irqsave(hs_lock, flags);
memset(&ap->eh_context, 0, sizeof(ap->eh_context));
ap->eh_context.i = ap->eh_info;
memset(&ap->eh_info, 0, sizeof(ap->eh_info));
ap->flags &= ~ATA_FLAG_EH_PENDING;
spin_unlock_irqrestore(hs_lock, flags);
/* invoke EH */
ap->ops->error_handler(ap);
/* Exception might have happend after ->error_handler
* recovered the port but before this point. Repeat
* EH in such case.
*/
spin_lock_irqsave(hs_lock, flags);
if (ap->flags & ATA_FLAG_EH_PENDING) {
if (--repeat_cnt) {
ata_port_printk(ap, KERN_INFO,
"EH pending after completion, "
"repeating EH (cnt=%d)\n", repeat_cnt);
spin_unlock_irqrestore(hs_lock, flags);
goto repeat;
}
ata_port_printk(ap, KERN_ERR, "EH pending after %d "
"tries, giving up\n", ATA_EH_MAX_REPEAT);
}
/* this run is complete, make sure EH info is clear */
memset(&ap->eh_info, 0, sizeof(ap->eh_info));
/* Clear host_eh_scheduled while holding hs_lock such
* that if exception occurs after this point but
* before EH completion, SCSI midlayer will
* re-initiate EH.
*/
host->host_eh_scheduled = 0;
spin_unlock_irqrestore(hs_lock, flags);
} else {
WARN_ON(ata_qc_from_tag(ap, ap->active_tag) == NULL);
ap->ops->eng_timeout(ap);
}
/* finish or retry handled scmd's and clean up */
WARN_ON(host->host_failed || !list_empty(&host->eh_cmd_q));
scsi_eh_flush_done_q(&ap->eh_done_q);
/* clean up */
spin_lock_irqsave(hs_lock, flags);
if (ap->flags & ATA_FLAG_RECOVERED)
ata_port_printk(ap, KERN_INFO, "EH complete\n");
ap->flags &= ~ATA_FLAG_RECOVERED;
spin_unlock_irqrestore(hs_lock, flags);
DPRINTK("EXIT\n");
}
/**
* ata_qc_timeout - Handle timeout of queued command
* @qc: Command that timed out
*
* Some part of the kernel (currently, only the SCSI layer)
* has noticed that the active command on port @ap has not
* completed after a specified length of time. Handle this
* condition by disabling DMA (if necessary) and completing
* transactions, with error if necessary.
*
* This also handles the case of the "lost interrupt", where
* for some reason (possibly hardware bug, possibly driver bug)
* an interrupt was not delivered to the driver, even though the
* transaction completed successfully.
*
* TODO: kill this function once old EH is gone.
*
* LOCKING:
* Inherited from SCSI layer (none, can sleep)
*/
static void ata_qc_timeout(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct ata_host_set *host_set = ap->host_set;
u8 host_stat = 0, drv_stat;
unsigned long flags;
DPRINTK("ENTER\n");
ap->hsm_task_state = HSM_ST_IDLE;
spin_lock_irqsave(&host_set->lock, flags);
switch (qc->tf.protocol) {
case ATA_PROT_DMA:
case ATA_PROT_ATAPI_DMA:
host_stat = ap->ops->bmdma_status(ap);
/* before we do anything else, clear DMA-Start bit */
ap->ops->bmdma_stop(qc);
/* fall through */
default:
ata_altstatus(ap);
drv_stat = ata_chk_status(ap);
/* ack bmdma irq events */
ap->ops->irq_clear(ap);
ata_dev_printk(qc->dev, KERN_ERR, "command 0x%x timeout, "
"stat 0x%x host_stat 0x%x\n",
qc->tf.command, drv_stat, host_stat);
/* complete taskfile transaction */
qc->err_mask |= ac_err_mask(drv_stat);
break;
}
spin_unlock_irqrestore(&host_set->lock, flags);
ata_eh_qc_complete(qc);
DPRINTK("EXIT\n");
}
/**
* ata_eng_timeout - Handle timeout of queued command
* @ap: Port on which timed-out command is active
*
* Some part of the kernel (currently, only the SCSI layer)
* has noticed that the active command on port @ap has not
* completed after a specified length of time. Handle this
* condition by disabling DMA (if necessary) and completing
* transactions, with error if necessary.
*
* This also handles the case of the "lost interrupt", where
* for some reason (possibly hardware bug, possibly driver bug)
* an interrupt was not delivered to the driver, even though the
* transaction completed successfully.
*
* TODO: kill this function once old EH is gone.
*
* LOCKING:
* Inherited from SCSI layer (none, can sleep)
*/
void ata_eng_timeout(struct ata_port *ap)
{
DPRINTK("ENTER\n");
ata_qc_timeout(ata_qc_from_tag(ap, ap->active_tag));
DPRINTK("EXIT\n");
}
/**
* ata_qc_schedule_eh - schedule qc for error handling
* @qc: command to schedule error handling for
*
* Schedule error handling for @qc. EH will kick in as soon as
* other commands are drained.
*
* LOCKING:
* spin_lock_irqsave(host_set lock)
*/
void ata_qc_schedule_eh(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
WARN_ON(!ap->ops->error_handler);
qc->flags |= ATA_QCFLAG_FAILED;
qc->ap->flags |= ATA_FLAG_EH_PENDING;
/* The following will fail if timeout has already expired.
* ata_scsi_error() takes care of such scmds on EH entry.
* Note that ATA_QCFLAG_FAILED is unconditionally set after
* this function completes.
*/
scsi_req_abort_cmd(qc->scsicmd);
}
/**
* ata_port_schedule_eh - schedule error handling without a qc
* @ap: ATA port to schedule EH for
*
* Schedule error handling for @ap. EH will kick in as soon as
* all commands are drained.
*
* LOCKING:
* spin_lock_irqsave(host_set lock)
*/
void ata_port_schedule_eh(struct ata_port *ap)
{
WARN_ON(!ap->ops->error_handler);
ap->flags |= ATA_FLAG_EH_PENDING;
ata_schedule_scsi_eh(ap->host);
DPRINTK("port EH scheduled\n");
}
/**
* ata_port_abort - abort all qc's on the port
* @ap: ATA port to abort qc's for
*
* Abort all active qc's of @ap and schedule EH.
*
* LOCKING:
* spin_lock_irqsave(host_set lock)
*
* RETURNS:
* Number of aborted qc's.
*/
int ata_port_abort(struct ata_port *ap)
{
int tag, nr_aborted = 0;
WARN_ON(!ap->ops->error_handler);
for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
struct ata_queued_cmd *qc = ata_qc_from_tag(ap, tag);
if (qc) {
qc->flags |= ATA_QCFLAG_FAILED;
ata_qc_complete(qc);
nr_aborted++;
}
}
if (!nr_aborted)
ata_port_schedule_eh(ap);
return nr_aborted;
}
/**
* __ata_port_freeze - freeze port
* @ap: ATA port to freeze
*
* This function is called when HSM violation or some other
* condition disrupts normal operation of the port. Frozen port
* is not allowed to perform any operation until the port is
* thawed, which usually follows a successful reset.
*
* ap->ops->freeze() callback can be used for freezing the port
* hardware-wise (e.g. mask interrupt and stop DMA engine). If a
* port cannot be frozen hardware-wise, the interrupt handler
* must ack and clear interrupts unconditionally while the port
* is frozen.
*
* LOCKING:
* spin_lock_irqsave(host_set lock)
*/
static void __ata_port_freeze(struct ata_port *ap)
{
WARN_ON(!ap->ops->error_handler);
if (ap->ops->freeze)
ap->ops->freeze(ap);
ap->flags |= ATA_FLAG_FROZEN;
DPRINTK("ata%u port frozen\n", ap->id);
}
/**
* ata_port_freeze - abort & freeze port
* @ap: ATA port to freeze
*
* Abort and freeze @ap.
*
* LOCKING:
* spin_lock_irqsave(host_set lock)
*
* RETURNS:
* Number of aborted commands.
*/
int ata_port_freeze(struct ata_port *ap)
{
int nr_aborted;
WARN_ON(!ap->ops->error_handler);
nr_aborted = ata_port_abort(ap);
__ata_port_freeze(ap);
return nr_aborted;
}
/**
* ata_eh_freeze_port - EH helper to freeze port
* @ap: ATA port to freeze
*
* Freeze @ap.
*
* LOCKING:
* None.
*/
void ata_eh_freeze_port(struct ata_port *ap)
{
unsigned long flags;
if (!ap->ops->error_handler)
return;
spin_lock_irqsave(&ap->host_set->lock, flags);
__ata_port_freeze(ap);
spin_unlock_irqrestore(&ap->host_set->lock, flags);
}
/**
* ata_port_thaw_port - EH helper to thaw port
* @ap: ATA port to thaw
*
* Thaw frozen port @ap.
*
* LOCKING:
* None.
*/
void ata_eh_thaw_port(struct ata_port *ap)
{
unsigned long flags;
if (!ap->ops->error_handler)
return;
spin_lock_irqsave(&ap->host_set->lock, flags);
ap->flags &= ~ATA_FLAG_FROZEN;
if (ap->ops->thaw)
ap->ops->thaw(ap);
spin_unlock_irqrestore(&ap->host_set->lock, flags);
DPRINTK("ata%u port thawed\n", ap->id);
}
static void ata_eh_scsidone(struct scsi_cmnd *scmd)
{
/* nada */
}
static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
{
struct ata_port *ap = qc->ap;
struct scsi_cmnd *scmd = qc->scsicmd;
unsigned long flags;
spin_lock_irqsave(&ap->host_set->lock, flags);
qc->scsidone = ata_eh_scsidone;
__ata_qc_complete(qc);
WARN_ON(ata_tag_valid(qc->tag));
spin_unlock_irqrestore(&ap->host_set->lock, flags);
scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
}
/**
* ata_eh_qc_complete - Complete an active ATA command from EH
* @qc: Command to complete
*
* Indicate to the mid and upper layers that an ATA command has
* completed. To be used from EH.
*/
void ata_eh_qc_complete(struct ata_queued_cmd *qc)
{
struct scsi_cmnd *scmd = qc->scsicmd;
scmd->retries = scmd->allowed;
__ata_eh_qc_complete(qc);
}
/**
* ata_eh_qc_retry - Tell midlayer to retry an ATA command after EH
* @qc: Command to retry
*
* Indicate to the mid and upper layers that an ATA command
* should be retried. To be used from EH.
*
* SCSI midlayer limits the number of retries to scmd->allowed.
* scmd->retries is decremented for commands which get retried
* due to unrelated failures (qc->err_mask is zero).
*/
void ata_eh_qc_retry(struct ata_queued_cmd *qc)
{
struct scsi_cmnd *scmd = qc->scsicmd;
if (!qc->err_mask && scmd->retries)
scmd->retries--;
__ata_eh_qc_complete(qc);
}