Merge branch 'mlxsw-devlink-health-reporter-extensions'

Ido Schimmel says:

====================
mlxsw: devlink health reporter extensions

This patchset extends the devlink health reporter registered by mlxsw to
report new health events and their related parameters. These are meant
to aid in debugging hardware and firmware issues.

Patches #1-#2 are preparations.

Patch #3 adds the definitions of the new events and parameters.

Patch #4 extends the health reporter to report the new events and
parameters.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-12-20 11:32:22 +00:00
commit 5f89b38900
2 changed files with 283 additions and 30 deletions

View File

@ -1708,6 +1708,124 @@ static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg,
/* Listener object that hands MFDE events to
 * mlxsw_core_health_listener_func().
 * NOTE(review): the two MFDE arguments are presumably the trap ID and the
 * event group - confirm against the MLXSW_EVENTL() definition.
 */
static const struct mlxsw_listener mlxsw_core_health_listener =
MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE);
/* Append the "fatal cause" specific fields of an MFDE payload to @fmsg.
 *
 * Emits "cause_id" unconditionally and "tile_index" only when the payload's
 * tile_v field is non-zero.
 *
 * Return: 0 on success, or the first error returned by a devlink_fmsg put.
 */
static int
mlxsw_core_health_fw_fatal_dump_fatal_cause(const char *mfde_pl,
					    struct devlink_fmsg *fmsg)
{
	u32 field;
	int err;

	field = mlxsw_reg_mfde_fatal_cause_id_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "cause_id", field);
	if (err)
		return err;

	/* Without the tile valid bit there is no tile index to report. */
	if (!mlxsw_reg_mfde_fatal_cause_tile_v_get(mfde_pl))
		return 0;

	field = mlxsw_reg_mfde_fatal_cause_tile_index_get(mfde_pl);
	return devlink_fmsg_u8_pair_put(fmsg, "tile_index", field);
}
/* Append the "FW assert" specific fields of an MFDE payload to @fmsg.
 *
 * Fields are emitted in this order: var0..var4, existptr, callra, old_event,
 * tile_index (only when the payload's tile_v field is non-zero) and ext_synd.
 *
 * Return: 0 on success, or the first error returned by a devlink_fmsg put.
 */
static int
mlxsw_core_health_fw_fatal_dump_fw_assert(const char *mfde_pl,
					  struct devlink_fmsg *fmsg)
{
	u32 field;
	int err;

	/* The five variables that were passed to the assert. */
	field = mlxsw_reg_mfde_fw_assert_var0_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "var0", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_var1_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "var1", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_var2_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "var2", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_var3_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "var3", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_var4_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "var4", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_existptr_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "existptr", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_callra_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "callra", field);
	if (err)
		return err;

	field = mlxsw_reg_mfde_fw_assert_oe_get(mfde_pl);
	err = devlink_fmsg_bool_pair_put(fmsg, "old_event", field);
	if (err)
		return err;

	/* The tile index is reported only when the tile valid bit is set. */
	if (mlxsw_reg_mfde_fw_assert_tile_v_get(mfde_pl)) {
		field = mlxsw_reg_mfde_fw_assert_tile_index_get(mfde_pl);
		err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", field);
		if (err)
			return err;
	}

	field = mlxsw_reg_mfde_fw_assert_ext_synd_get(mfde_pl);
	return devlink_fmsg_u32_pair_put(fmsg, "ext_synd", field);
}
/* Append the "KVD insertion machine stopped" specific fields of an MFDE
 * payload to @fmsg: "old_event" followed by "pipes_mask".
 *
 * Return: 0 on success, or the first error returned by a devlink_fmsg put.
 */
static int
mlxsw_core_health_fw_fatal_dump_kvd_im_stop(const char *mfde_pl,
					    struct devlink_fmsg *fmsg)
{
	u32 field = mlxsw_reg_mfde_kvd_im_stop_oe_get(mfde_pl);
	int err = devlink_fmsg_bool_pair_put(fmsg, "old_event", field);

	if (err)
		return err;

	field = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", field);

	return err;
}
/* Append the "CRspace timeout" specific fields of an MFDE payload to @fmsg.
 *
 * Fields are emitted in this order: log_address, old_event, log_irisc_id and
 * log_ip.
 *
 * Return: 0 on success, or the first error returned by a devlink_fmsg put.
 */
static int
mlxsw_core_health_fw_fatal_dump_crspace_to(const char *mfde_pl,
					   struct devlink_fmsg *fmsg)
{
	u32 val;
	int err;

	val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl);
	err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val);
	if (err)
		return err;

	val = mlxsw_reg_mfde_crspace_to_oe_get(mfde_pl);
	err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
	if (err)
		return err;

	val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl);
	err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val);
	if (err)
		return err;

	/* log_ip is a 64-bit register field. Hand the getter's result straight
	 * to the u64 put instead of staging it in the 32-bit 'val', which
	 * would drop the upper half of the instruction pointer.
	 */
	return devlink_fmsg_u64_pair_put(fmsg, "log_ip",
					 mlxsw_reg_mfde_crspace_to_log_ip_get(mfde_pl));
}
static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *reporter,
struct devlink_fmsg *fmsg, void *priv_ctx,
struct netlink_ext_ack *extack)
@ -1741,6 +1859,15 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
val_str = "KVD insertion machine stopped";
break;
case MLXSW_REG_MFDE_EVENT_ID_TEST:
val_str = "Test";
break;
case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
val_str = "FW assert";
break;
case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
val_str = "Fatal cause";
break;
default:
val_str = NULL;
}
@ -1749,6 +1876,38 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
if (err)
return err;
}
err = devlink_fmsg_arr_pair_nest_end(fmsg);
if (err)
return err;
err = devlink_fmsg_arr_pair_nest_start(fmsg, "severity");
if (err)
return err;
val = mlxsw_reg_mfde_severity_get(mfde_pl);
err = devlink_fmsg_u8_pair_put(fmsg, "id", val);
if (err)
return err;
switch (val) {
case MLXSW_REG_MFDE_SEVERITY_FATL:
val_str = "Fatal";
break;
case MLXSW_REG_MFDE_SEVERITY_NRML:
val_str = "Normal";
break;
case MLXSW_REG_MFDE_SEVERITY_INTR:
val_str = "Debug";
break;
default:
val_str = NULL;
}
if (val_str) {
err = devlink_fmsg_string_pair_put(fmsg, "desc", val_str);
if (err)
return err;
}
err = devlink_fmsg_arr_pair_nest_end(fmsg);
if (err)
return err;
@ -1800,24 +1959,18 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
if (err)
return err;
if (event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO) {
val = mlxsw_reg_mfde_log_address_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val);
if (err)
return err;
val = mlxsw_reg_mfde_log_id_get(mfde_pl);
err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val);
if (err)
return err;
val = mlxsw_reg_mfde_log_ip_get(mfde_pl);
err = devlink_fmsg_u64_pair_put(fmsg, "log_ip", val);
if (err)
return err;
} else if (event_id == MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP) {
val = mlxsw_reg_mfde_pipes_mask_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val);
if (err)
return err;
switch (event_id) {
case MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO:
return mlxsw_core_health_fw_fatal_dump_crspace_to(mfde_pl,
fmsg);
case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
return mlxsw_core_health_fw_fatal_dump_kvd_im_stop(mfde_pl,
fmsg);
case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
return mlxsw_core_health_fw_fatal_dump_fw_assert(mfde_pl, fmsg);
case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
return mlxsw_core_health_fw_fatal_dump_fatal_cause(mfde_pl,
fmsg);
}
return 0;

View File

@ -11318,7 +11318,7 @@ mlxsw_reg_mgpir_unpack(char *payload, u8 *num_of_devices,
* -----------------------------------
*/
#define MLXSW_REG_MFDE_ID 0x9200
#define MLXSW_REG_MFDE_LEN 0x18
#define MLXSW_REG_MFDE_LEN 0x30
MLXSW_REG_DEFINE(mfde, MLXSW_REG_MFDE_ID, MLXSW_REG_MFDE_LEN);
@ -11328,10 +11328,32 @@ MLXSW_REG_DEFINE(mfde, MLXSW_REG_MFDE_ID, MLXSW_REG_MFDE_LEN);
*/
MLXSW_ITEM32(reg, mfde, irisc_id, 0x00, 24, 8);
/* Values of the MFDE severity field. */
enum mlxsw_reg_mfde_severity {
	MLXSW_REG_MFDE_SEVERITY_FATL = 2, /* Unrecoverable switch behavior */
	MLXSW_REG_MFDE_SEVERITY_NRML = 3, /* Unexpected state with possible
					   * systemic failure
					   */
	MLXSW_REG_MFDE_SEVERITY_INTR = 5, /* Unexpected state without
					   * systemic failure
					   */
};
/* reg_mfde_severity
* The severity of the event.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, severity, 0x00, 16, 8);
/* Values of the MFDE event_id field. Every value is spelled out so the
 * numbering stays visible when entries are added or reordered.
 */
enum mlxsw_reg_mfde_event_id {
	/* CRspace timeout */
	MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO = 1,
	/* KVD insertion machine stopped */
	MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP = 2,
	/* Triggered by MFGD.trigger_test */
	MLXSW_REG_MFDE_EVENT_ID_TEST = 3,
	/* Triggered when firmware hits an assert */
	MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT = 4,
	/* Fatal error interrupt from hardware */
	MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE = 5,
};
/* reg_mfde_event_id
@ -11372,32 +11394,110 @@ MLXSW_ITEM32(reg, mfde, command_type, 0x04, 24, 2);
*/
MLXSW_ITEM32(reg, mfde, reg_attr_id, 0x04, 0, 16);
/* reg_mfde_log_address
/* reg_mfde_crspace_to_log_address
* crspace address accessed, which resulted in timeout.
* Valid in case event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, log_address, 0x10, 0, 32);
MLXSW_ITEM32(reg, mfde, crspace_to_log_address, 0x10, 0, 32);
/* reg_mfde_log_id
/* reg_mfde_crspace_to_oe
* 0 - New event
* 1 - Old event, occurred before MFGD activation.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, crspace_to_oe, 0x14, 24, 1);
/* reg_mfde_crspace_to_log_id
* Which irisc triggered the timeout.
* Valid in case event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, log_id, 0x14, 0, 4);
MLXSW_ITEM32(reg, mfde, crspace_to_log_id, 0x14, 0, 4);
/* reg_mfde_log_ip
/* reg_mfde_crspace_to_log_ip
* IP (instruction pointer) that triggered the timeout.
* Valid in case event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO
* Access: RO
*/
MLXSW_ITEM64(reg, mfde, log_ip, 0x18, 0, 64);
MLXSW_ITEM64(reg, mfde, crspace_to_log_ip, 0x18, 0, 64);
/* reg_mfde_pipes_mask
/* reg_mfde_kvd_im_stop_oe
* 0 - New event
* 1 - Old event, occurred before MFGD activation.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, kvd_im_stop_oe, 0x10, 24, 1);
/* reg_mfde_kvd_im_stop_pipes_mask
* Bit per kvh pipe.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, pipes_mask, 0x10, 0, 16);
MLXSW_ITEM32(reg, mfde, kvd_im_stop_pipes_mask, 0x10, 0, 16);
/* reg_mfde_fw_assert_var0-4
* Variables passed to assert.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_var0, 0x10, 0, 32);
MLXSW_ITEM32(reg, mfde, fw_assert_var1, 0x14, 0, 32);
MLXSW_ITEM32(reg, mfde, fw_assert_var2, 0x18, 0, 32);
MLXSW_ITEM32(reg, mfde, fw_assert_var3, 0x1C, 0, 32);
MLXSW_ITEM32(reg, mfde, fw_assert_var4, 0x20, 0, 32);
/* reg_mfde_fw_assert_existptr
* The instruction pointer when assert was triggered.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_existptr, 0x24, 0, 32);
/* reg_mfde_fw_assert_callra
* The return address after triggering assert.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_callra, 0x28, 0, 32);
/* reg_mfde_fw_assert_oe
* 0 - New event
* 1 - Old event, occurred before MFGD activation.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_oe, 0x2C, 24, 1);
/* reg_mfde_fw_assert_tile_v
* 0: The assert was from main
* 1: The assert was from a tile
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_tile_v, 0x2C, 23, 1);
/* reg_mfde_fw_assert_tile_index
* When tile_v=1, the tile_index that caused the assert.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_tile_index, 0x2C, 16, 6);
/* reg_mfde_fw_assert_ext_synd
* A generated one-to-one identifier which is specific per-assert.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fw_assert_ext_synd, 0x2C, 0, 16);
/* reg_mfde_fatal_cause_id
* HW interrupt cause id.
* Access: RO
*/
MLXSW_ITEM32(reg, mfde, fatal_cause_id, 0x10, 0, 18);
/* reg_mfde_fatal_cause_tile_v
 * 0: The fatal cause was from main
 * 1: The fatal cause was from a tile
 * Access: RO
 */
MLXSW_ITEM32(reg, mfde, fatal_cause_tile_v, 0x14, 23, 1);
/* reg_mfde_fatal_cause_tile_index
 * When tile_v=1, the tile_index that caused the fatal cause.
 * Access: RO
 */
MLXSW_ITEM32(reg, mfde, fatal_cause_tile_index, 0x14, 16, 6);
/* TNGCR - Tunneling NVE General Configuration Register
* ----------------------------------------------------