2005-11-03 18:52:49 -06:00
/*
* PCI Error Recovery Driver for RPA - compliant PPC64 platform .
* Copyright ( C ) 2004 , 2005 Linas Vepstas < linas @ linas . org >
*
* All rights reserved .
*
* This program is free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or ( at
* your option ) any later version .
*
* This program is distributed in the hope that it will be useful , but
* WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE , GOOD TITLE or
* NON INFRINGEMENT . See the GNU General Public License for more
* details .
*
* You should have received a copy of the GNU General Public License
* along with this program ; if not , write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*
* Send feedback to < linas @ us . ibm . com >
*
*/
# include <linux/delay.h>
# include <linux/interrupt.h>
2006-04-18 21:05:21 -07:00
# include <linux/irq.h>
2005-11-03 18:52:49 -06:00
# include <linux/pci.h>
# include <asm/eeh.h>
# include <asm/eeh_event.h>
# include <asm/ppc-pci.h>
# include <asm/pci-bridge.h>
# include <asm/prom.h>
# include <asm/rtas.h>
static inline const char * pcid_name ( struct pci_dev * pdev )
{
2006-02-27 15:52:59 +01:00
if ( pdev & & pdev - > dev . driver )
2005-11-03 18:52:49 -06:00
return pdev - > dev . driver - > name ;
return " " ;
}
# ifdef DEBUG
static void print_device_node_tree ( struct pci_dn * pdn , int dent )
{
int i ;
if ( ! pdn ) return ;
for ( i = 0 ; i < dent ; i + + )
printk ( " " ) ;
printk ( " dn=%s mode=%x \t cfg_addr=%x pe_addr=%x \t full=%s \n " ,
pdn - > node - > name , pdn - > eeh_mode , pdn - > eeh_config_addr ,
pdn - > eeh_pe_config_addr , pdn - > node - > full_name ) ;
dent + = 3 ;
struct device_node * pc = pdn - > node - > child ;
while ( pc ) {
print_device_node_tree ( PCI_DN ( pc ) , dent ) ;
pc = pc - > sibling ;
}
}
# endif
/**
* irq_in_use - return true if this irq is being used
*/
static int irq_in_use ( unsigned int irq )
{
int rc = 0 ;
unsigned long flags ;
struct irq_desc * desc = irq_desc + irq ;
spin_lock_irqsave ( & desc - > lock , flags ) ;
if ( desc - > action )
rc = 1 ;
spin_unlock_irqrestore ( & desc - > lock , flags ) ;
return rc ;
}
/* ------------------------------------------------------- */
2006-09-15 18:56:35 -05:00
/**
* eeh_report_error - report pci error to each device driver
*
* Report an EEH error to each device driver , collect up and
* merge the device driver responses . Cumulative response
* passed back in " userdata " .
2005-11-03 18:52:49 -06:00
*/
static void eeh_report_error ( struct pci_dev * dev , void * userdata )
{
2005-11-29 17:17:02 +11:00
enum pci_ers_result rc , * res = userdata ;
2005-11-03 18:52:49 -06:00
struct pci_driver * driver = dev - > driver ;
dev - > error_state = pci_channel_io_frozen ;
if ( ! driver )
return ;
if ( irq_in_use ( dev - > irq ) ) {
struct device_node * dn = pci_device_to_OF_node ( dev ) ;
PCI_DN ( dn ) - > eeh_mode | = EEH_MODE_IRQ_DISABLED ;
disable_irq_nosync ( dev - > irq ) ;
}
2006-09-15 18:58:59 -05:00
if ( ! driver - > err_handler | |
! driver - > err_handler - > error_detected )
2005-11-03 18:52:49 -06:00
return ;
rc = driver - > err_handler - > error_detected ( dev , pci_channel_io_frozen ) ;
2005-11-29 17:17:02 +11:00
if ( * res = = PCI_ERS_RESULT_NONE ) * res = rc ;
2006-09-15 18:58:59 -05:00
if ( * res = = PCI_ERS_RESULT_DISCONNECT & &
rc = = PCI_ERS_RESULT_NEED_RESET ) * res = rc ;
}
/**
* eeh_report_mmio_enabled - tell drivers that MMIO has been enabled
*
* Report an EEH error to each device driver , collect up and
* merge the device driver responses . Cumulative response
* passed back in " userdata " .
*/
static void eeh_report_mmio_enabled ( struct pci_dev * dev , void * userdata )
{
enum pci_ers_result rc , * res = userdata ;
struct pci_driver * driver = dev - > driver ;
// dev->error_state = pci_channel_mmio_enabled;
if ( ! driver | |
! driver - > err_handler | |
! driver - > err_handler - > mmio_enabled )
return ;
rc = driver - > err_handler - > mmio_enabled ( dev ) ;
if ( * res = = PCI_ERS_RESULT_NONE ) * res = rc ;
2005-11-29 17:17:02 +11:00
if ( * res = = PCI_ERS_RESULT_DISCONNECT & &
rc = = PCI_ERS_RESULT_NEED_RESET ) * res = rc ;
2005-11-03 18:52:49 -06:00
}
2006-09-15 18:56:35 -05:00
/**
* eeh_report_reset - tell device that slot has been reset
2005-11-03 18:52:49 -06:00
*/
static void eeh_report_reset ( struct pci_dev * dev , void * userdata )
{
2006-09-15 18:58:59 -05:00
enum pci_ers_result rc , * res = userdata ;
2005-11-03 18:52:49 -06:00
struct pci_driver * driver = dev - > driver ;
struct device_node * dn = pci_device_to_OF_node ( dev ) ;
if ( ! driver )
return ;
if ( ( PCI_DN ( dn ) - > eeh_mode ) & EEH_MODE_IRQ_DISABLED ) {
PCI_DN ( dn ) - > eeh_mode & = ~ EEH_MODE_IRQ_DISABLED ;
enable_irq ( dev - > irq ) ;
}
2006-09-15 18:58:59 -05:00
if ( ! driver - > err_handler | |
! driver - > err_handler - > slot_reset )
2005-11-03 18:52:49 -06:00
return ;
2006-09-15 18:58:59 -05:00
rc = driver - > err_handler - > slot_reset ( dev ) ;
2007-03-19 14:55:51 -05:00
if ( ( * res = = PCI_ERS_RESULT_NONE ) | |
( * res = = PCI_ERS_RESULT_RECOVERED ) ) * res = rc ;
2006-09-15 18:58:59 -05:00
if ( * res = = PCI_ERS_RESULT_DISCONNECT & &
rc = = PCI_ERS_RESULT_NEED_RESET ) * res = rc ;
2005-11-03 18:52:49 -06:00
}
2006-09-15 18:56:35 -05:00
/**
* eeh_report_resume - tell device to resume normal operations
*/
2005-11-03 18:52:49 -06:00
static void eeh_report_resume ( struct pci_dev * dev , void * userdata )
{
struct pci_driver * driver = dev - > driver ;
2006-12-06 12:32:20 -06:00
struct device_node * dn = pci_device_to_OF_node ( dev ) ;
2005-11-03 18:52:49 -06:00
dev - > error_state = pci_channel_io_normal ;
if ( ! driver )
return ;
2006-12-06 12:32:20 -06:00
if ( ( PCI_DN ( dn ) - > eeh_mode ) & EEH_MODE_IRQ_DISABLED ) {
PCI_DN ( dn ) - > eeh_mode & = ~ EEH_MODE_IRQ_DISABLED ;
enable_irq ( dev - > irq ) ;
}
if ( ! driver - > err_handler | |
! driver - > err_handler - > resume )
2005-11-03 18:52:49 -06:00
return ;
driver - > err_handler - > resume ( dev ) ;
}
2006-09-15 18:56:35 -05:00
/**
* eeh_report_failure - tell device driver that device is dead .
*
* This informs the device driver that the device is permanently
* dead , and that no further recovery attempts will be made on it .
*/
2005-11-03 18:52:49 -06:00
static void eeh_report_failure ( struct pci_dev * dev , void * userdata )
{
struct pci_driver * driver = dev - > driver ;
dev - > error_state = pci_channel_io_perm_failure ;
if ( ! driver )
return ;
if ( irq_in_use ( dev - > irq ) ) {
struct device_node * dn = pci_device_to_OF_node ( dev ) ;
PCI_DN ( dn ) - > eeh_mode | = EEH_MODE_IRQ_DISABLED ;
disable_irq_nosync ( dev - > irq ) ;
}
if ( ! driver - > err_handler )
return ;
if ( ! driver - > err_handler - > error_detected )
return ;
driver - > err_handler - > error_detected ( dev , pci_channel_io_perm_failure ) ;
}
/* ------------------------------------------------------- */
/**
 * handle_eeh_events -- reset a PCI device after hard lockup.
 *
 * pSeries systems will isolate a PCI slot if the PCI-Host
 * bridge detects address or data parity errors, DMA's
 * occurring to wild addresses (which usually happen due to
 * bugs in device drivers or in PCI adapter firmware).
 * Slot isolations also occur if #SERR, #PERR or other misc
 * PCI-related errors are detected.
 *
 * Recovery process consists of unplugging the device driver
 * (which generated hotplug events to userspace), then issuing
 * a PCI #RST to the device, then reconfiguring the PCI config
 * space for all bridges & devices under this slot, and then
 * finally restarting the device drivers (which cause a second
 * set of hotplug events to go out to userspace).
 */
/**
* eeh_reset_device ( ) - - perform actual reset of a pci slot
2006-09-15 18:56:35 -05:00
* @ bus : pointer to the pci bus structure corresponding
2005-11-03 18:52:49 -06:00
* to the isolated slot . A non - null value will
* cause all devices under the bus to be removed
* and then re - added .
2006-09-15 18:56:35 -05:00
* @ pe_dn : pointer to a " Partionable Endpoint " device node .
2005-11-03 18:52:49 -06:00
* This is the top - level structure on which pci
* bus resets can be performed .
*/
2005-11-03 18:54:54 -06:00
static int eeh_reset_device ( struct pci_dn * pe_dn , struct pci_bus * bus )
2005-11-03 18:52:49 -06:00
{
2007-03-19 15:01:31 -05:00
struct device_node * dn ;
2006-04-28 17:39:38 -05:00
int cnt , rc ;
/* pcibios will clear the counter; save the value */
cnt = pe_dn - > eeh_freeze_count ;
2005-11-03 18:52:49 -06:00
if ( bus )
pcibios_remove_pci_devices ( bus ) ;
/* Reset the pci controller. (Asserts RST#; resets config space).
2005-11-03 18:54:54 -06:00
* Reconfigure bridges and devices . Don ' t try to bring the system
* up if the reset failed for some reason . */
rc = rtas_set_slot_reset ( pe_dn ) ;
if ( rc )
return rc ;
2005-11-03 18:52:49 -06:00
2007-03-19 15:01:31 -05:00
/* Walk over all functions on this device. */
dn = pe_dn - > node ;
if ( ! pcibios_find_pci_bus ( dn ) & & PCI_DN ( dn - > parent ) )
dn = dn - > parent - > child ;
while ( dn ) {
struct pci_dn * ppe = PCI_DN ( dn ) ;
/* On Power4, always true because eeh_pe_config_addr=0 */
if ( pe_dn - > eeh_pe_config_addr = = ppe - > eeh_pe_config_addr ) {
rtas_configure_bridge ( ppe ) ;
eeh_restore_bars ( ppe ) ;
2005-11-03 18:55:01 -06:00
}
2007-03-19 15:01:31 -05:00
dn = dn - > sibling ;
}
2005-11-03 18:52:49 -06:00
/* Give the system 5 seconds to finish running the user-space
* hotplug shutdown scripts , e . g . ifdown for ethernet . Yes ,
* this is a hack , but if we don ' t do this , and try to bring
* the device up before the scripts have taken it down ,
* potentially weird things happen .
*/
if ( bus ) {
ssleep ( 5 ) ;
pcibios_add_pci_devices ( bus ) ;
}
2006-04-28 17:39:38 -05:00
pe_dn - > eeh_freeze_count = cnt ;
2005-11-03 18:54:54 -06:00
return 0 ;
2005-11-03 18:52:49 -06:00
}
/* The longest amount of time to wait for a pci device
 * to come back on line, in seconds.
 */
2007-03-19 14:59:59 -05:00
#define MAX_WAIT_FOR_RECOVERY 150	/* seconds; multiplied by 1000 where it is passed to eeh_wait_for_slot_status() */
2005-11-03 18:52:49 -06:00
2006-04-18 21:05:21 -07:00
struct pci_dn * handle_eeh_events ( struct eeh_event * event )
2005-11-03 18:52:49 -06:00
{
struct device_node * frozen_dn ;
struct pci_dn * frozen_pdn ;
struct pci_bus * frozen_bus ;
2005-11-03 18:54:54 -06:00
int rc = 0 ;
2005-11-29 17:17:02 +11:00
enum pci_ers_result result = PCI_ERS_RESULT_NONE ;
2006-06-19 15:07:40 -05:00
const char * location , * pci_str , * drv_str ;
2005-11-03 18:52:49 -06:00
frozen_dn = find_device_pe ( event - > dn ) ;
frozen_bus = pcibios_find_pci_bus ( frozen_dn ) ;
if ( ! frozen_dn ) {
2006-06-19 15:07:40 -05:00
2007-04-03 22:26:41 +10:00
location = of_get_property ( event - > dn , " ibm,loc-code " , NULL ) ;
2006-06-19 15:07:40 -05:00
location = location ? location : " unknown " ;
printk ( KERN_ERR " EEH: Error: Cannot find partition endpoint "
" for location=%s pci addr=%s \n " ,
location , pci_name ( event - > dev ) ) ;
2006-04-18 21:05:21 -07:00
return NULL ;
2005-11-03 18:52:49 -06:00
}
2007-04-03 22:26:41 +10:00
location = of_get_property ( frozen_dn , " ibm,loc-code " , NULL ) ;
2006-06-19 15:07:40 -05:00
location = location ? location : " unknown " ;
2005-11-03 18:52:49 -06:00
/* There are two different styles for coming up with the PE.
* In the old style , it was the highest EEH - capable device
* which was always an EADS pci bridge . In the new style ,
* there might not be any EADS bridges , and even when there are ,
* the firmware marks them as " EEH incapable " . So another
* two - step is needed to find the pci bus . . */
if ( ! frozen_bus )
frozen_bus = pcibios_find_pci_bus ( frozen_dn - > parent ) ;
if ( ! frozen_bus ) {
2006-06-19 15:07:40 -05:00
printk ( KERN_ERR " EEH: Cannot find PCI bus "
" for location=%s dn=%s \n " ,
location , frozen_dn - > full_name ) ;
2006-04-18 21:05:21 -07:00
return NULL ;
2005-11-03 18:52:49 -06:00
}
frozen_pdn = PCI_DN ( frozen_dn ) ;
frozen_pdn - > eeh_freeze_count + + ;
2006-03-24 17:11:29 -06:00
2006-03-30 15:27:33 -06:00
if ( frozen_pdn - > pcidev ) {
pci_str = pci_name ( frozen_pdn - > pcidev ) ;
drv_str = pcid_name ( frozen_pdn - > pcidev ) ;
} else {
2006-03-24 17:11:29 -06:00
pci_str = pci_name ( event - > dev ) ;
drv_str = pcid_name ( event - > dev ) ;
}
2005-11-03 18:52:49 -06:00
if ( frozen_pdn - > eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES )
2006-03-29 15:31:04 -06:00
goto excess_failures ;
2005-11-03 18:52:49 -06:00
2007-03-19 14:59:10 -05:00
/* Get the current PCI slot state. */
rc = eeh_wait_for_slot_status ( frozen_pdn , MAX_WAIT_FOR_RECOVERY * 1000 ) ;
if ( rc < 0 ) {
2007-03-19 14:52:04 -05:00
printk ( KERN_WARNING " EEH: Permanent failure \n " ) ;
2005-11-03 18:54:54 -06:00
goto hard_fail ;
2007-03-19 14:52:04 -05:00
}
2005-11-03 18:52:49 -06:00
printk ( KERN_WARNING
2007-05-09 09:34:40 +10:00
" EEH: This PCI device has failed %d times in the last hour: \n " ,
frozen_pdn - > eeh_freeze_count ) ;
printk ( KERN_WARNING
" EEH: location=%s driver=%s pci addr=%s \n " ,
location , drv_str , pci_str ) ;
2005-11-03 18:52:49 -06:00
/* Walk the various device drivers attached to this slot through
* a reset sequence , giving each an opportunity to do what it needs
* to accomplish the reset . Each child gets a report of the
* status . . . if any child can ' t handle the reset , then the entire
* slot is dlpar removed and added .
*/
pci_walk_bus ( frozen_bus , eeh_report_error , & result ) ;
2007-05-09 09:33:29 +10:00
/* Since rtas may enable MMIO when posting the error log,
* don ' t post the error log until after all dev drivers
2007-05-10 02:38:11 +10:00
* have been informed .
*/
eeh_slot_error_detail ( frozen_pdn , EEH_LOG_TEMP_FAILURE ) ;
2007-05-09 09:33:29 +10:00
2005-11-03 18:52:49 -06:00
/* If all device drivers were EEH-unaware, then shut
* down all of the device drivers , and hope they
* go down willingly , without panicing the system .
*/
2005-11-29 17:17:02 +11:00
if ( result = = PCI_ERS_RESULT_NONE ) {
2005-11-03 18:54:54 -06:00
rc = eeh_reset_device ( frozen_pdn , frozen_bus ) ;
2007-03-19 14:52:04 -05:00
if ( rc ) {
printk ( KERN_WARNING " EEH: Unable to reset, rc=%d \n " , rc ) ;
2005-11-03 18:54:54 -06:00
goto hard_fail ;
2007-03-19 14:52:04 -05:00
}
2005-11-03 18:52:49 -06:00
}
2006-09-15 18:58:59 -05:00
/* If all devices reported they can proceed, then re-enable MMIO */
if ( result = = PCI_ERS_RESULT_CAN_RECOVER ) {
rc = rtas_pci_enable ( frozen_pdn , EEH_THAW_MMIO ) ;
2007-03-19 14:59:59 -05:00
if ( rc < 0 )
goto hard_fail ;
2006-09-15 18:58:59 -05:00
if ( rc ) {
result = PCI_ERS_RESULT_NEED_RESET ;
} else {
result = PCI_ERS_RESULT_NONE ;
pci_walk_bus ( frozen_bus , eeh_report_mmio_enabled , & result ) ;
}
2005-11-03 18:52:49 -06:00
}
2006-09-15 18:58:59 -05:00
/* If all devices reported they can proceed, then re-enable DMA */
2005-11-29 17:17:02 +11:00
if ( result = = PCI_ERS_RESULT_CAN_RECOVER ) {
2006-09-15 18:58:59 -05:00
rc = rtas_pci_enable ( frozen_pdn , EEH_THAW_DMA ) ;
2007-03-19 14:59:59 -05:00
if ( rc < 0 )
goto hard_fail ;
2006-09-15 18:58:59 -05:00
if ( rc )
result = PCI_ERS_RESULT_NEED_RESET ;
2006-12-06 12:32:20 -06:00
else
result = PCI_ERS_RESULT_RECOVERED ;
2006-09-15 18:58:59 -05:00
}
/* If any device has a hard failure, then shut off everything. */
2007-03-19 14:52:04 -05:00
if ( result = = PCI_ERS_RESULT_DISCONNECT ) {
printk ( KERN_WARNING " EEH: Device driver gave up \n " ) ;
2006-09-15 18:58:59 -05:00
goto hard_fail ;
2007-03-19 14:52:04 -05:00
}
2006-09-15 18:58:59 -05:00
/* If any device called out for a reset, then reset the slot */
if ( result = = PCI_ERS_RESULT_NEED_RESET ) {
2005-11-03 18:54:54 -06:00
rc = eeh_reset_device ( frozen_pdn , NULL ) ;
2007-03-19 14:52:04 -05:00
if ( rc ) {
printk ( KERN_WARNING " EEH: Cannot reset, rc=%d \n " , rc ) ;
2005-11-03 18:54:54 -06:00
goto hard_fail ;
2007-03-19 14:52:04 -05:00
}
2006-09-15 18:58:59 -05:00
result = PCI_ERS_RESULT_NONE ;
pci_walk_bus ( frozen_bus , eeh_report_reset , & result ) ;
2005-11-03 18:52:49 -06:00
}
2006-09-15 18:58:59 -05:00
/* All devices should claim they have recovered by now. */
2007-03-19 14:55:10 -05:00
if ( ( result ! = PCI_ERS_RESULT_RECOVERED ) & &
( result ! = PCI_ERS_RESULT_NONE ) ) {
2007-03-19 14:52:04 -05:00
printk ( KERN_WARNING " EEH: Not recovered \n " ) ;
2006-09-15 18:58:59 -05:00
goto hard_fail ;
2007-03-19 14:52:04 -05:00
}
2006-09-15 18:58:59 -05:00
2005-11-03 18:52:49 -06:00
/* Tell all device drivers that they can resume operations */
2006-02-01 07:23:24 -05:00
pci_walk_bus ( frozen_bus , eeh_report_resume , NULL ) ;
2005-11-03 18:54:54 -06:00
2006-04-18 21:05:21 -07:00
return frozen_pdn ;
2005-11-03 18:54:54 -06:00
2006-03-29 15:31:04 -06:00
excess_failures :
2005-11-03 18:54:54 -06:00
/*
* About 90 % of all real - life EEH failures in the field
* are due to poorly seated PCI cards . Only 10 % or so are
* due to actual , failed cards .
*/
printk ( KERN_ERR
2006-06-19 15:07:40 -05:00
" EEH: PCI device at location=%s driver=%s pci addr=%s \n "
2006-12-19 13:06:17 -06:00
" has failed %d times in the last hour "
" and has been permanently disabled. \n "
2006-06-19 15:07:40 -05:00
" Please try reseating this device or replacing it. \n " ,
location , drv_str , pci_str , frozen_pdn - > eeh_freeze_count ) ;
2006-03-29 15:31:04 -06:00
goto perm_error ;
hard_fail :
printk ( KERN_ERR
2006-06-19 15:07:40 -05:00
" EEH: Unable to recover from failure of PCI device "
" at location=%s driver=%s pci addr=%s \n "
2006-03-29 15:31:04 -06:00
" Please try reseating this device or replacing it. \n " ,
2006-06-19 15:07:40 -05:00
location , drv_str , pci_str ) ;
2005-11-03 18:54:54 -06:00
2006-03-29 15:31:04 -06:00
perm_error :
2007-05-10 02:38:11 +10:00
eeh_slot_error_detail ( frozen_pdn , EEH_LOG_PERM_FAILURE ) ;
2005-11-03 18:54:54 -06:00
/* Notify all devices that they're about to go down. */
2006-02-01 07:23:24 -05:00
pci_walk_bus ( frozen_bus , eeh_report_failure , NULL ) ;
2005-11-03 18:54:54 -06:00
/* Shut down the device drivers for good. */
pcibios_remove_pci_devices ( frozen_bus ) ;
2006-04-18 21:05:21 -07:00
return NULL ;
2005-11-03 18:52:49 -06:00
}
/* ---------- end of file ---------- */