net/mlx5: Print more info on pci error handlers
If mlx5_pci_err_detected() is called with a state equal to pci_channel_io_perm_failure, the driver will never come back up. It is useful to know why the driver went to zombie land, so print some useful information in the PCI error handlers. Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
This commit is contained in:
@ -1604,12 +1604,28 @@ static void remove_one(struct pci_dev *pdev)
|
|||||||
mlx5_devlink_free(devlink);
|
mlx5_devlink_free(devlink);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Emit a trace line for a PCI error-handler step through mlx5_core_info().
 * Each message starts with the calling function's name, the device's state
 * field, the value returned by mlx5_health_check_fatal_sensors() and the
 * device's pci_status field, followed by the caller-supplied format and
 * arguments. The dev argument is evaluated exactly once.
 */
#define mlx5_pci_trace(dev, fmt, ...) ({				\
	struct mlx5_core_dev *__mdev = (dev);				\
	mlx5_core_info(__mdev,						\
		       "%s Device state = %d health sensors: %d pci_status: %d. " fmt, \
		       __func__,					\
		       __mdev->state,					\
		       mlx5_health_check_fatal_sensors(__mdev),		\
		       __mdev->pci_status,				\
		       ##__VA_ARGS__);					\
})
|
||||||
|
|
||||||
|
static const char *result2str(enum pci_ers_result result)
|
||||||
|
{
|
||||||
|
return result == PCI_ERS_RESULT_NEED_RESET ? "need reset" :
|
||||||
|
result == PCI_ERS_RESULT_DISCONNECT ? "disconnect" :
|
||||||
|
result == PCI_ERS_RESULT_RECOVERED ? "recovered" :
|
||||||
|
"unknown";
|
||||||
|
}
|
||||||
|
|
||||||
static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
|
static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
|
||||||
pci_channel_state_t state)
|
pci_channel_state_t state)
|
||||||
{
|
{
|
||||||
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
||||||
|
enum pci_ers_result res;
|
||||||
|
|
||||||
mlx5_core_info(dev, "%s was called\n", __func__);
|
mlx5_pci_trace(dev, "Enter, pci channel state = %d\n", state);
|
||||||
|
|
||||||
mlx5_enter_error_state(dev, false);
|
mlx5_enter_error_state(dev, false);
|
||||||
mlx5_error_sw_reset(dev);
|
mlx5_error_sw_reset(dev);
|
||||||
@ -1617,8 +1633,11 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev,
|
|||||||
mlx5_drain_health_wq(dev);
|
mlx5_drain_health_wq(dev);
|
||||||
mlx5_pci_disable_device(dev);
|
mlx5_pci_disable_device(dev);
|
||||||
|
|
||||||
return state == pci_channel_io_perm_failure ?
|
res = state == pci_channel_io_perm_failure ?
|
||||||
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
|
PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET;
|
||||||
|
|
||||||
|
mlx5_pci_trace(dev, "Exit, result = %d, %s\n", res, result2str(res));
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* wait for the device to show vital signs by waiting
|
/* wait for the device to show vital signs by waiting
|
||||||
@ -1652,28 +1671,34 @@ static int wait_vital(struct pci_dev *pdev)
|
|||||||
|
|
||||||
static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
|
static pci_ers_result_t mlx5_pci_slot_reset(struct pci_dev *pdev)
|
||||||
{
|
{
|
||||||
|
enum pci_ers_result res = PCI_ERS_RESULT_DISCONNECT;
|
||||||
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
mlx5_core_info(dev, "%s was called\n", __func__);
|
mlx5_pci_trace(dev, "Enter\n");
|
||||||
|
|
||||||
err = mlx5_pci_enable_device(dev);
|
err = mlx5_pci_enable_device(dev);
|
||||||
if (err) {
|
if (err) {
|
||||||
mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n",
|
mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n",
|
||||||
__func__, err);
|
__func__, err);
|
||||||
return PCI_ERS_RESULT_DISCONNECT;
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
pci_set_master(pdev);
|
pci_set_master(pdev);
|
||||||
pci_restore_state(pdev);
|
pci_restore_state(pdev);
|
||||||
pci_save_state(pdev);
|
pci_save_state(pdev);
|
||||||
|
|
||||||
if (wait_vital(pdev)) {
|
err = wait_vital(pdev);
|
||||||
mlx5_core_err(dev, "%s: wait_vital timed out\n", __func__);
|
if (err) {
|
||||||
return PCI_ERS_RESULT_DISCONNECT;
|
mlx5_core_err(dev, "%s: wait vital failed with error code: %d\n",
|
||||||
|
__func__, err);
|
||||||
|
goto out;
|
||||||
}
|
}
|
||||||
|
|
||||||
return PCI_ERS_RESULT_RECOVERED;
|
res = PCI_ERS_RESULT_RECOVERED;
|
||||||
|
out:
|
||||||
|
mlx5_pci_trace(dev, "Exit, err = %d, result = %d, %s\n", err, res, result2str(res));
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void mlx5_pci_resume(struct pci_dev *pdev)
|
static void mlx5_pci_resume(struct pci_dev *pdev)
|
||||||
@ -1681,14 +1706,12 @@ static void mlx5_pci_resume(struct pci_dev *pdev)
|
|||||||
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
mlx5_core_info(dev, "%s was called\n", __func__);
|
mlx5_pci_trace(dev, "Enter, loading driver..\n");
|
||||||
|
|
||||||
err = mlx5_load_one(dev);
|
err = mlx5_load_one(dev);
|
||||||
if (err)
|
|
||||||
mlx5_core_err(dev, "%s: mlx5_load_one failed with error code: %d\n",
|
mlx5_pci_trace(dev, "Done, err = %d, device %s\n", err,
|
||||||
__func__, err);
|
!err ? "recovered" : "Failed");
|
||||||
else
|
|
||||||
mlx5_core_info(dev, "%s: device recovered\n", __func__);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static const struct pci_error_handlers mlx5_err_handler = {
|
static const struct pci_error_handlers mlx5_err_handler = {
|
||||||
|
Reference in New Issue
Block a user