net: hns3: add handling of RDMA RAS errors
This patch handles the RDMA RAS errors: 1. Enable the RAS interrupt, print detailed error information, and clear the error status. 2. Perform a CORE reset to recover when these non-fatal errors occur. Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com> Signed-off-by: Shiju Jose <shiju.jose@huawei.com> Signed-off-by: Salil Mehta <salil.mehta@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
c3529177db
commit
630ba007f4
@ -229,6 +229,9 @@ enum hclge_opcode_type {
|
||||
HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
|
||||
HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
|
||||
HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
|
||||
HCLGE_CONFIG_ROCEE_RAS_INT_EN = 0x1580,
|
||||
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
|
||||
HCLGE_ROCEE_PF_RAS_INT_CMD = 0x1584,
|
||||
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
|
||||
HCLGE_IGU_COMMON_INT_EN = 0x1806,
|
||||
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
|
||||
|
@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = {
|
||||
{ /* sentinel */ }
|
||||
};
|
||||
|
||||
/*
 * ROCEE QMM overflow error table.
 *
 * hclge_log_rocee_ovf_error() compares the masked overflow type field of the
 * status word against .int_msk for equality (not as a bit mask) and prints
 * the .msg of the matching entry. The table ends at the first entry whose
 * .msg is NULL.
 */
static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
	/* sgid / smac */
	{ .int_msk = 0x00, .msg = "rocee qmm ovf: sgid invalid err" },
	{ .int_msk = 0x04, .msg = "rocee qmm ovf: sgid ovf err" },
	{ .int_msk = 0x08, .msg = "rocee qmm ovf: smac invalid err" },
	{ .int_msk = 0x0C, .msg = "rocee qmm ovf: smac ovf err" },

	/* cqc */
	{ .int_msk = 0x10, .msg = "rocee qmm ovf: cqc invalid err" },
	{ .int_msk = 0x11, .msg = "rocee qmm ovf: cqc ovf err" },
	{ .int_msk = 0x12, .msg = "rocee qmm ovf: cqc hopnum err" },
	{ .int_msk = 0x13, .msg = "rocee qmm ovf: cqc ba0 err" },

	/* srqc */
	{ .int_msk = 0x14, .msg = "rocee qmm ovf: srqc invalid err" },
	{ .int_msk = 0x15, .msg = "rocee qmm ovf: srqc ovf err" },
	{ .int_msk = 0x16, .msg = "rocee qmm ovf: srqc hopnum err" },
	{ .int_msk = 0x17, .msg = "rocee qmm ovf: srqc ba0 err" },

	/* mpt */
	{ .int_msk = 0x18, .msg = "rocee qmm ovf: mpt invalid err" },
	{ .int_msk = 0x19, .msg = "rocee qmm ovf: mpt ovf err" },
	{ .int_msk = 0x1A, .msg = "rocee qmm ovf: mpt hopnum err" },
	{ .int_msk = 0x1B, .msg = "rocee qmm ovf: mpt ba0 err" },

	/* qpc */
	{ .int_msk = 0x1C, .msg = "rocee qmm ovf: qpc invalid err" },
	{ .int_msk = 0x1D, .msg = "rocee qmm ovf: qpc ovf err" },
	{ .int_msk = 0x1E, .msg = "rocee qmm ovf: qpc hopnum err" },
	{ .int_msk = 0x1F, .msg = "rocee qmm ovf: qpc ba0 err" },

	{ /* sentinel */ }
};
|
||||
|
||||
static void hclge_log_error(struct device *dev, char *reg,
|
||||
const struct hclge_hw_error *err,
|
||||
u32 err_sts)
|
||||
@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev)
|
||||
{
|
||||
struct device *dev = &hdev->pdev->dev;
|
||||
struct hclge_desc desc[2];
|
||||
int ret;
|
||||
|
||||
/* read overflow error status */
|
||||
ret = hclge_cmd_query_error(hdev, &desc[0],
|
||||
HCLGE_ROCEE_PF_RAS_INT_CMD,
|
||||
0, 0, 0);
|
||||
if (ret) {
|
||||
dev_err(dev, "failed(%d) to query ROCEE OVF error sts\n", ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* log overflow error */
|
||||
if (le32_to_cpu(desc[0].data[0]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
|
||||
const struct hclge_hw_error *err;
|
||||
u32 err_sts;
|
||||
|
||||
err = &hclge_rocee_qmm_ovf_err_int[0];
|
||||
err_sts = HCLGE_ROCEE_OVF_ERR_TYPE_MASK &
|
||||
le32_to_cpu(desc[0].data[0]);
|
||||
while (err->msg) {
|
||||
if (err->int_msk == err_sts) {
|
||||
dev_warn(dev, "%s [error status=0x%x] found\n",
|
||||
err->msg,
|
||||
le32_to_cpu(desc[0].data[0]));
|
||||
break;
|
||||
}
|
||||
err++;
|
||||
}
|
||||
}
|
||||
|
||||
if (le32_to_cpu(desc[0].data[1]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
|
||||
dev_warn(dev, "ROCEE TSP OVF [error status=0x%x] found\n",
|
||||
le32_to_cpu(desc[0].data[1]));
|
||||
}
|
||||
|
||||
if (le32_to_cpu(desc[0].data[2]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
|
||||
dev_warn(dev, "ROCEE SCC OVF [error status=0x%x] found\n",
|
||||
le32_to_cpu(desc[0].data[2]));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev)
|
||||
{
|
||||
enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET;
|
||||
struct hnae3_ae_dev *ae_dev = hdev->ae_dev;
|
||||
struct device *dev = &hdev->pdev->dev;
|
||||
struct hclge_desc desc[2];
|
||||
unsigned int status;
|
||||
int ret;
|
||||
|
||||
/* read RAS error interrupt status */
|
||||
ret = hclge_cmd_query_error(hdev, &desc[0],
|
||||
HCLGE_QUERY_CLEAR_ROCEE_RAS_INT,
|
||||
0, 0, 0);
|
||||
if (ret) {
|
||||
dev_err(dev, "failed(%d) to query ROCEE RAS INT SRC\n", ret);
|
||||
/* reset everything for now */
|
||||
HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET);
|
||||
return ret;
|
||||
}
|
||||
|
||||
status = le32_to_cpu(desc[0].data[0]);
|
||||
|
||||
if (status & HCLGE_ROCEE_RERR_INT_MASK)
|
||||
dev_warn(dev, "ROCEE RAS AXI rresp error\n");
|
||||
|
||||
if (status & HCLGE_ROCEE_BERR_INT_MASK)
|
||||
dev_warn(dev, "ROCEE RAS AXI bresp error\n");
|
||||
|
||||
if (status & HCLGE_ROCEE_ECC_INT_MASK) {
|
||||
dev_warn(dev, "ROCEE RAS 2bit ECC error\n");
|
||||
reset_type = HNAE3_GLOBAL_RESET;
|
||||
}
|
||||
|
||||
if (status & HCLGE_ROCEE_OVF_INT_MASK) {
|
||||
ret = hclge_log_rocee_ovf_error(hdev);
|
||||
if (ret) {
|
||||
dev_err(dev, "failed(%d) to process ovf error\n", ret);
|
||||
/* reset everything for now */
|
||||
HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
/* clear error status */
|
||||
hclge_cmd_reuse_desc(&desc[0], false);
|
||||
ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
|
||||
if (ret) {
|
||||
dev_err(dev, "failed(%d) to clear ROCEE RAS error\n", ret);
|
||||
/* reset everything for now */
|
||||
reset_type = HNAE3_GLOBAL_RESET;
|
||||
}
|
||||
|
||||
HCLGE_SET_DEFAULT_RESET_REQUEST(reset_type);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Enable or disable the ROCEE RAS error interrupts.
 *
 * Nothing is done (and 0 is returned) when the PCI revision is below 0x21 or
 * the device does not support RoCE. When enabling, the current ROCEE RAS
 * error status is logged and cleared before the configuration command is
 * sent.
 *
 * @hdev: device to configure
 * @en: true to enable the interrupts, false to disable them
 *
 * Return: 0 on success or when skipped, otherwise the error returned by
 * hclge_cmd_send().
 */
static int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en)
{
	struct device *dev = &hdev->pdev->dev;
	struct hclge_desc desc;
	int ret;

	if (hdev->pdev->revision < 0x21)
		return 0;

	if (!hnae3_dev_roce_supported(hdev))
		return 0;

	hclge_cmd_setup_basic_desc(&desc, HCLGE_CONFIG_ROCEE_RAS_INT_EN, false);

	/* the enable masks are written for both enable and disable */
	desc.data[2] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN_MASK);
	desc.data[3] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN_MASK);

	if (en) {
		/* enable ROCEE hw error interrupts */
		desc.data[0] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN);
		desc.data[1] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN);

		/* return value is not checked here */
		hclge_log_and_clear_rocee_ras_error(hdev);
	}

	ret = hclge_cmd_send(&hdev->hw, &desc, 1);
	if (ret)
		dev_err(dev, "failed(%d) to config ROCEE RAS interrupt\n", ret);

	return ret;
}
|
||||
|
||||
static int hclge_handle_rocee_ras_error(struct hnae3_ae_dev *ae_dev)
|
||||
{
|
||||
struct hclge_dev *hdev = ae_dev->priv;
|
||||
|
||||
if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
|
||||
hdev->pdev->revision < 0x21)
|
||||
return HNAE3_NONE_RESET;
|
||||
|
||||
return hclge_log_and_clear_rocee_ras_error(hdev);
|
||||
}
|
||||
|
||||
static const struct hclge_hw_blk hw_blk[] = {
|
||||
{
|
||||
.msk = BIT(0), .name = "IGU_EGU",
|
||||
@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = {
|
||||
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
|
||||
{
|
||||
const struct hclge_hw_blk *module = hw_blk;
|
||||
struct device *dev = &hdev->pdev->dev;
|
||||
int ret = 0;
|
||||
|
||||
while (module->name) {
|
||||
@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
|
||||
module++;
|
||||
}
|
||||
|
||||
ret = hclge_config_rocee_ras_interrupt(hdev, state);
|
||||
if (ret)
|
||||
dev_err(dev, "fail(%d) to configure ROCEE err int\n", ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
|
||||
"HNS Non-Fatal RAS error(status=0x%x) identified\n",
|
||||
status);
|
||||
hclge_handle_all_ras_errors(hdev);
|
||||
return PCI_ERS_RESULT_NEED_RESET;
|
||||
} else {
|
||||
if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
|
||||
hdev->pdev->revision < 0x21)
|
||||
return PCI_ERS_RESULT_RECOVERED;
|
||||
}
|
||||
|
||||
if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK) {
|
||||
dev_warn(dev, "ROCEE uncorrected RAS error identified\n");
|
||||
hclge_handle_rocee_ras_error(ae_dev);
|
||||
}
|
||||
|
||||
if (status & HCLGE_RAS_REG_NFE_MASK ||
|
||||
status & HCLGE_RAS_REG_ROCEE_ERR_MASK)
|
||||
return PCI_ERS_RESULT_NEED_RESET;
|
||||
|
||||
return PCI_ERS_RESULT_RECOVERED;
|
||||
}
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
|
||||
#define HCLGE_RAS_REG_NFE_MASK 0xFF00
|
||||
#define HCLGE_RAS_REG_ROCEE_ERR_MASK 0x3000000
|
||||
|
||||
#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800
|
||||
#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
|
||||
@ -83,6 +84,17 @@
|
||||
#define HCLGE_QCN_ECC_INT_MASK GENMASK(21, 0)
|
||||
#define HCLGE_NCSI_ECC_INT_MASK GENMASK(1, 0)
|
||||
|
||||
#define HCLGE_ROCEE_RAS_NFE_INT_EN 0xF
|
||||
#define HCLGE_ROCEE_RAS_CE_INT_EN 0x1
|
||||
#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK 0xF
|
||||
#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK 0x1
|
||||
#define HCLGE_ROCEE_RERR_INT_MASK BIT(0)
|
||||
#define HCLGE_ROCEE_BERR_INT_MASK BIT(1)
|
||||
#define HCLGE_ROCEE_ECC_INT_MASK BIT(2)
|
||||
#define HCLGE_ROCEE_OVF_INT_MASK BIT(3)
|
||||
#define HCLGE_ROCEE_OVF_ERR_INT_MASK 0x10000
|
||||
#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK 0x3F
|
||||
|
||||
enum hclge_err_int_type {
|
||||
HCLGE_ERR_INT_MSIX = 0,
|
||||
HCLGE_ERR_INT_RAS_CE = 1,
|
||||
|
Loading…
Reference in New Issue
Block a user