Merge branch 'hns3-error-handling'
Salil Mehta says: ==================== net: hns3: Additions/optimizations related to HNS3 H/W err handling This patch set primarily makes the following additions and optimizations related to error handling in the HNS3 Ethernet driver: 1. Name changes for enable and process functions and minor loop optimizations. [PATCH 1-6] 2. Modify query and clearing of RAS errors using a new set of commands because the module-specific commands for clearing RCB, PPP PF and SSU are obsolete. [PATCH 7] 3. Delete logging of 1-bit errors for RAS in the HNS3 driver as these never get reported to the driver. [PATCH 8] 4. Add handling of NIC hw errors reported through MSIx rather than the PCIe AER channel. [PATCH 9] 5. Add handling for the HW RAS and MSIx errors in the modules MAC, PPP PF, MSIx SRAM, RCB and SSU. [PATCH 10-13] 6. Add handling of RoCEE RAS errors. [PATCH 14] ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
12edfdfc79
@ -136,6 +136,7 @@ enum hnae3_reset_type {
|
||||
HNAE3_CORE_RESET,
|
||||
HNAE3_GLOBAL_RESET,
|
||||
HNAE3_IMP_RESET,
|
||||
HNAE3_UNKNOWN_RESET,
|
||||
HNAE3_NONE_RESET,
|
||||
};
|
||||
|
||||
@ -454,7 +455,7 @@ struct hnae3_ae_ops {
|
||||
int (*restore_fd_rules)(struct hnae3_handle *handle);
|
||||
void (*enable_fd)(struct hnae3_handle *handle, bool enable);
|
||||
int (*dbg_run_cmd)(struct hnae3_handle *handle, char *cmd_buf);
|
||||
pci_ers_result_t (*process_hw_error)(struct hnae3_ae_dev *ae_dev);
|
||||
pci_ers_result_t (*handle_hw_ras_error)(struct hnae3_ae_dev *ae_dev);
|
||||
bool (*get_hw_reset_stat)(struct hnae3_handle *handle);
|
||||
bool (*ae_dev_resetting)(struct hnae3_handle *handle);
|
||||
unsigned long (*ae_dev_reset_cnt)(struct hnae3_handle *handle);
|
||||
|
@ -1828,8 +1828,8 @@ static pci_ers_result_t hns3_error_detected(struct pci_dev *pdev,
|
||||
return PCI_ERS_RESULT_NONE;
|
||||
}
|
||||
|
||||
if (ae_dev->ops->process_hw_error)
|
||||
ret = ae_dev->ops->process_hw_error(ae_dev);
|
||||
if (ae_dev->ops->handle_hw_ras_error)
|
||||
ret = ae_dev->ops->handle_hw_ras_error(ae_dev);
|
||||
else
|
||||
return PCI_ERS_RESULT_NONE;
|
||||
|
||||
|
@ -215,26 +215,29 @@ enum hclge_opcode_type {
|
||||
HCLGE_OPC_SFP_GET_SPEED = 0x7104,
|
||||
|
||||
/* Error INT commands */
|
||||
HCLGE_MAC_COMMON_INT_EN = 0x030E,
|
||||
HCLGE_TM_SCH_ECC_INT_EN = 0x0829,
|
||||
HCLGE_TM_SCH_ECC_ERR_RINT_CMD = 0x082d,
|
||||
HCLGE_TM_SCH_ECC_ERR_RINT_CE = 0x082f,
|
||||
HCLGE_TM_SCH_ECC_ERR_RINT_NFE = 0x0830,
|
||||
HCLGE_TM_SCH_ECC_ERR_RINT_FE = 0x0831,
|
||||
HCLGE_TM_SCH_MBIT_ECC_INFO_CMD = 0x0833,
|
||||
HCLGE_SSU_ECC_INT_CMD = 0x0989,
|
||||
HCLGE_SSU_COMMON_INT_CMD = 0x098C,
|
||||
HCLGE_PPU_MPF_ECC_INT_CMD = 0x0B40,
|
||||
HCLGE_PPU_MPF_OTHER_INT_CMD = 0x0B41,
|
||||
HCLGE_PPU_PF_OTHER_INT_CMD = 0x0B42,
|
||||
HCLGE_COMMON_ECC_INT_CFG = 0x1505,
|
||||
HCLGE_IGU_EGU_TNL_INT_QUERY = 0x1802,
|
||||
HCLGE_QUERY_RAS_INT_STS_BD_NUM = 0x1510,
|
||||
HCLGE_QUERY_CLEAR_MPF_RAS_INT = 0x1511,
|
||||
HCLGE_QUERY_CLEAR_PF_RAS_INT = 0x1512,
|
||||
HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
|
||||
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
|
||||
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
|
||||
HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580,
|
||||
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
|
||||
HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584,
|
||||
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
|
||||
HCLGE_IGU_EGU_TNL_INT_CLR = 0x1804,
|
||||
HCLGE_IGU_COMMON_INT_QUERY = 0x1805,
|
||||
HCLGE_IGU_COMMON_INT_EN = 0x1806,
|
||||
HCLGE_IGU_COMMON_INT_CLR = 0x1807,
|
||||
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
|
||||
HCLGE_TM_QCN_MEM_INT_INFO_CMD = 0x1A17,
|
||||
HCLGE_PPP_CMD0_INT_CMD = 0x2100,
|
||||
HCLGE_PPP_CMD1_INT_CMD = 0x2101,
|
||||
HCLGE_NCSI_INT_QUERY = 0x2400,
|
||||
HCLGE_NCSI_INT_EN = 0x2401,
|
||||
HCLGE_NCSI_INT_CLR = 0x2402,
|
||||
};
|
||||
|
||||
#define HCLGE_TQP_REG_OFFSET 0x80000
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -7,9 +7,11 @@
|
||||
#include "hclge_main.h"
|
||||
|
||||
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
|
||||
#define HCLGE_RAS_REG_FE_MASK 0xFF
|
||||
#define HCLGE_RAS_REG_NFE_MASK 0xFF00
|
||||
#define HCLGE_RAS_REG_NFE_SHIFT 8
|
||||
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
|
||||
|
||||
#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800
|
||||
#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
|
||||
|
||||
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN 0xFFFF0000
|
||||
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN_MASK 0xFFFF0000
|
||||
@ -23,6 +25,8 @@
|
||||
#define HCLGE_IMP_RD_POISON_ERR_INT_EN_MASK 0x0100
|
||||
#define HCLGE_TQP_ECC_ERR_INT_EN 0x0FFF
|
||||
#define HCLGE_TQP_ECC_ERR_INT_EN_MASK 0x0FFF
|
||||
#define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN_MASK 0x0F000000
|
||||
#define HCLGE_MSIX_SRAM_ECC_ERR_INT_EN 0x0F000000
|
||||
#define HCLGE_IGU_ERR_INT_EN 0x0000066F
|
||||
#define HCLGE_IGU_ERR_INT_EN_MASK 0x000F
|
||||
#define HCLGE_IGU_TNL_ERR_INT_EN 0x0002AABF
|
||||
@ -41,21 +45,55 @@
|
||||
#define HCLGE_TM_QCN_MEM_ERR_INT_EN 0xFFFFFF
|
||||
#define HCLGE_NCSI_ERR_INT_EN 0x3
|
||||
#define HCLGE_NCSI_ERR_INT_TYPE 0x9
|
||||
#define HCLGE_MAC_COMMON_ERR_INT_EN GENMASK(7, 0)
|
||||
#define HCLGE_MAC_COMMON_ERR_INT_EN_MASK GENMASK(7, 0)
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT0_EN GENMASK(31, 0)
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT0_EN_MASK GENMASK(31, 0)
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT1_EN GENMASK(31, 0)
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT1_EN_MASK GENMASK(31, 0)
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN 0x3FFF3FFF
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN_MASK 0x3FFF3FFF
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN2 0xB
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT2_EN2_MASK 0xB
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT3_EN GENMASK(7, 0)
|
||||
#define HCLGE_PPU_MPF_ABNORMAL_INT3_EN_MASK GENMASK(23, 16)
|
||||
#define HCLGE_PPU_PF_ABNORMAL_INT_EN GENMASK(5, 0)
|
||||
#define HCLGE_PPU_PF_ABNORMAL_INT_EN_MASK GENMASK(5, 0)
|
||||
#define HCLGE_SSU_1BIT_ECC_ERR_INT_EN GENMASK(31, 0)
|
||||
#define HCLGE_SSU_1BIT_ECC_ERR_INT_EN_MASK GENMASK(31, 0)
|
||||
#define HCLGE_SSU_MULTI_BIT_ECC_ERR_INT_EN GENMASK(31, 0)
|
||||
#define HCLGE_SSU_MULTI_BIT_ECC_ERR_INT_EN_MASK GENMASK(31, 0)
|
||||
#define HCLGE_SSU_BIT32_ECC_ERR_INT_EN 0x0101
|
||||
#define HCLGE_SSU_BIT32_ECC_ERR_INT_EN_MASK 0x0101
|
||||
#define HCLGE_SSU_COMMON_INT_EN GENMASK(9, 0)
|
||||
#define HCLGE_SSU_COMMON_INT_EN_MASK GENMASK(9, 0)
|
||||
#define HCLGE_SSU_PORT_BASED_ERR_INT_EN 0x0BFF
|
||||
#define HCLGE_SSU_PORT_BASED_ERR_INT_EN_MASK 0x0BFF0000
|
||||
#define HCLGE_SSU_FIFO_OVERFLOW_ERR_INT_EN GENMASK(23, 0)
|
||||
#define HCLGE_SSU_FIFO_OVERFLOW_ERR_INT_EN_MASK GENMASK(23, 0)
|
||||
|
||||
#define HCLGE_IMP_TCM_ECC_INT_MASK 0xFFFF
|
||||
#define HCLGE_IMP_ITCM4_ECC_INT_MASK 0x3
|
||||
#define HCLGE_CMDQ_ECC_INT_MASK 0xFFFF
|
||||
#define HCLGE_CMDQ_ROC_ECC_INT_SHIFT 16
|
||||
#define HCLGE_TQP_ECC_INT_MASK 0xFFF
|
||||
#define HCLGE_TQP_ECC_INT_SHIFT 16
|
||||
#define HCLGE_IMP_TCM_ECC_CLR_MASK 0xFFFF
|
||||
#define HCLGE_IMP_ITCM4_ECC_CLR_MASK 0x3
|
||||
#define HCLGE_CMDQ_NIC_ECC_CLR_MASK 0xFFFF
|
||||
#define HCLGE_CMDQ_ROCEE_ECC_CLR_MASK 0xFFFF0000
|
||||
#define HCLGE_TQP_IMP_ERR_CLR_MASK 0x0FFF0001
|
||||
#define HCLGE_IGU_COM_INT_MASK 0xF
|
||||
#define HCLGE_IGU_EGU_TNL_INT_MASK 0x3F
|
||||
#define HCLGE_PPP_PF_INT_MASK 0x100
|
||||
#define HCLGE_SSU_COMMON_ERR_INT_MASK GENMASK(9, 0)
|
||||
#define HCLGE_SSU_PORT_INT_MSIX_MASK 0x7BFF
|
||||
#define HCLGE_IGU_INT_MASK GENMASK(3, 0)
|
||||
#define HCLGE_IGU_EGU_TNL_INT_MASK GENMASK(5, 0)
|
||||
#define HCLGE_PPP_MPF_INT_ST3_MASK GENMASK(5, 0)
|
||||
#define HCLGE_PPU_MPF_INT_ST3_MASK GENMASK(7, 0)
|
||||
#define HCLGE_PPU_MPF_INT_ST2_MSIX_MASK GENMASK(29, 28)
|
||||
#define HCLGE_PPU_PF_INT_MSIX_MASK 0x27
|
||||
#define HCLGE_QCN_FIFO_INT_MASK GENMASK(17, 0)
|
||||
#define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0)
|
||||
#define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0)
|
||||
|
||||
#define HCLGE_ROCEE_RAS_NFE_INT_EN 0xF
|
||||
#define HCLGE_ROCEE_RAS_CE_INT_EN 0x1
|
||||
#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK 0xF
|
||||
#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK 0x1
|
||||
#define HCLGE_ROCEE_RERR_INT_MASK BIT(0)
|
||||
#define HCLGE_ROCEE_BERR_INT_MASK BIT(1)
|
||||
#define HCLGE_ROCEE_ECC_INT_MASK BIT(2)
|
||||
#define HCLGE_ROCEE_OVF_INT_MASK BIT(3)
|
||||
#define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000
|
||||
#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F
|
||||
|
||||
enum hclge_err_int_type {
|
||||
HCLGE_ERR_INT_MSIX = 0,
|
||||
@ -67,9 +105,7 @@ enum hclge_err_int_type {
|
||||
struct hclge_hw_blk {
|
||||
u32 msk;
|
||||
const char *name;
|
||||
int (*enable_error)(struct hclge_dev *hdev, bool en);
|
||||
void (*process_error)(struct hclge_dev *hdev,
|
||||
enum hclge_err_int_type type);
|
||||
int (*config_err_int)(struct hclge_dev *hdev, bool en);
|
||||
};
|
||||
|
||||
struct hclge_hw_error {
|
||||
@ -78,6 +114,7 @@ struct hclge_hw_error {
|
||||
};
|
||||
|
||||
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state);
|
||||
int hclge_enable_tm_hw_error(struct hclge_dev *hdev, bool en);
|
||||
pci_ers_result_t hclge_process_ras_hw_error(struct hnae3_ae_dev *ae_dev);
|
||||
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
|
||||
int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
|
||||
unsigned long *reset_requests);
|
||||
#endif
|
||||
|
@ -2200,12 +2200,13 @@ static void hclge_service_complete(struct hclge_dev *hdev)
|
||||
|
||||
static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
|
||||
{
|
||||
u32 rst_src_reg;
|
||||
u32 cmdq_src_reg;
|
||||
u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
|
||||
|
||||
/* fetch the events from their corresponding regs */
|
||||
rst_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
|
||||
cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
|
||||
msix_src_reg = hclge_read_dev(&hdev->hw,
|
||||
HCLGE_VECTOR0_PF_OTHER_INT_STS_REG);
|
||||
|
||||
/* Assumption: If by any chance reset and mailbox events are reported
|
||||
* together then we will only process reset event in this go and will
|
||||
@ -2239,6 +2240,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
|
||||
return HCLGE_VECTOR0_EVENT_RST;
|
||||
}
|
||||
|
||||
/* check for vector0 msix event source */
|
||||
if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK)
|
||||
return HCLGE_VECTOR0_EVENT_ERR;
|
||||
|
||||
/* check for vector0 mailbox(=CMDQ RX) event source */
|
||||
if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
|
||||
cmdq_src_reg &= ~BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B);
|
||||
@ -2289,6 +2294,19 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
|
||||
|
||||
/* vector 0 interrupt is shared with reset and mailbox source events.*/
|
||||
switch (event_cause) {
|
||||
case HCLGE_VECTOR0_EVENT_ERR:
|
||||
/* we do not know what type of reset is required now. This could
|
||||
* only be decided after we fetch the type of errors which
|
||||
* caused this event. Therefore, we will do below for now:
|
||||
* 1. Assert HNAE3_UNKNOWN_RESET type of reset. This means we
|
||||
* have deferred type of reset to be used.
|
||||
* 2. Schedule the reset service task.
|
||||
* 3. When service task receives HNAE3_UNKNOWN_RESET type it
|
||||
* will fetch the correct type of reset. This would be done
|
||||
* by first decoding the types of errors.
|
||||
*/
|
||||
set_bit(HNAE3_UNKNOWN_RESET, &hdev->reset_request);
|
||||
/* fall through */
|
||||
case HCLGE_VECTOR0_EVENT_RST:
|
||||
hclge_reset_task_schedule(hdev);
|
||||
break;
|
||||
@ -2593,6 +2611,23 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hclge_dev *hdev,
|
||||
{
|
||||
enum hnae3_reset_type rst_level = HNAE3_NONE_RESET;
|
||||
|
||||
/* first, resolve any unknown reset type to the known type(s) */
|
||||
if (test_bit(HNAE3_UNKNOWN_RESET, addr)) {
|
||||
/* we will intentionally ignore any errors from this function
|
||||
* as we will end up in *some* reset request in any case
|
||||
*/
|
||||
hclge_handle_hw_msix_error(hdev, addr);
|
||||
clear_bit(HNAE3_UNKNOWN_RESET, addr);
|
||||
/* We deferred the clearing of the error event which caused
|
||||
* interrupt since it was not possible to do that in
|
||||
* interrupt context (and this is the reason we introduced
|
||||
* new UNKNOWN reset type). Now, the errors have been
|
||||
* handled and cleared in hardware we can safely enable
|
||||
* interrupts. This is an exception to the norm.
|
||||
*/
|
||||
hclge_enable_vector(&hdev->misc_vector, true);
|
||||
}
|
||||
|
||||
/* return the highest priority reset level amongst all */
|
||||
if (test_bit(HNAE3_IMP_RESET, addr)) {
|
||||
rst_level = HNAE3_IMP_RESET;
|
||||
@ -7269,7 +7304,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
|
||||
ret = hclge_hw_error_set_state(hdev, true);
|
||||
if (ret) {
|
||||
dev_err(&pdev->dev,
|
||||
"hw error interrupts enable failed, ret =%d\n", ret);
|
||||
"fail(%d) to enable hw error interrupts\n", ret);
|
||||
goto err_mdiobus_unreg;
|
||||
}
|
||||
|
||||
@ -7405,11 +7440,15 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Re-enable the TM hw error interrupts because
|
||||
* they get disabled on core/global reset.
|
||||
/* Re-enable the hw error interrupts because
|
||||
* the interrupts get disabled on core/global reset.
|
||||
*/
|
||||
if (hclge_enable_tm_hw_error(hdev, true))
|
||||
dev_err(&pdev->dev, "failed to enable TM hw error interrupts\n");
|
||||
ret = hclge_hw_error_set_state(hdev, true);
|
||||
if (ret) {
|
||||
dev_err(&pdev->dev,
|
||||
"fail(%d) to re-enable HNS hw error interrupts\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
hclge_reset_vport_state(hdev);
|
||||
|
||||
@ -7931,7 +7970,7 @@ static const struct hnae3_ae_ops hclge_ops = {
|
||||
.restore_fd_rules = hclge_restore_fd_entries,
|
||||
.enable_fd = hclge_enable_fd,
|
||||
.dbg_run_cmd = hclge_dbg_run_cmd,
|
||||
.process_hw_error = hclge_process_ras_hw_error,
|
||||
.handle_hw_ras_error = hclge_handle_hw_ras_error,
|
||||
.get_hw_reset_stat = hclge_get_hw_reset_stat,
|
||||
.ae_dev_resetting = hclge_ae_dev_resetting,
|
||||
.ae_dev_reset_cnt = hclge_ae_dev_reset_cnt,
|
||||
|
@ -205,6 +205,7 @@ enum HCLGE_DEV_STATE {
|
||||
enum hclge_evt_cause {
|
||||
HCLGE_VECTOR0_EVENT_RST,
|
||||
HCLGE_VECTOR0_EVENT_MBX,
|
||||
HCLGE_VECTOR0_EVENT_ERR,
|
||||
HCLGE_VECTOR0_EVENT_OTHER,
|
||||
};
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user