mlxsw: core: Extend devlink health reporter with new events and parameters

Extend the devlink health reporter registered by mlxsw to report new
health events and their related parameters. These are meant to aid in
debugging of hardware / firmware issues.

Beside the test event ('MLXSW_REG_MFDE_EVENT_ID_TEST') that is triggered
following the devlink health 'test' sub-command, the new events are used
to report the triggering of asserts in firmware code
('MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT') and hardware issues
('MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE').

Each event is accompanied with a severity parameter and per-event
parameters that are meant to help root cause the detected issue.

Signed-off-by: Danielle Ratson <danieller@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Danielle Ratson 2021-12-20 12:56:14 +02:00 committed by David S. Miller
parent e25c060c5f
commit 239cdd3f4c

View file

@ -1708,12 +1708,93 @@ static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg,
static const struct mlxsw_listener mlxsw_core_health_listener =
MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE);
static int
mlxsw_core_health_fw_fatal_dump_fatal_cause(const char *mfde_pl,
struct devlink_fmsg *fmsg)
{
u32 val, tile_v;
int err;
val = mlxsw_reg_mfde_fatal_cause_id_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "cause_id", val);
if (err)
return err;
tile_v = mlxsw_reg_mfde_fatal_cause_tile_v_get(mfde_pl);
if (tile_v) {
val = mlxsw_reg_mfde_fatal_cause_tile_index_get(mfde_pl);
err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val);
if (err)
return err;
}
return 0;
}
static int
mlxsw_core_health_fw_fatal_dump_fw_assert(const char *mfde_pl,
struct devlink_fmsg *fmsg)
{
u32 val, tile_v;
int err;
val = mlxsw_reg_mfde_fw_assert_var0_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "var0", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_var1_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "var1", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_var2_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "var2", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_var3_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "var3", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_var4_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "var4", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_existptr_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "existptr", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_callra_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "callra", val);
if (err)
return err;
val = mlxsw_reg_mfde_fw_assert_oe_get(mfde_pl);
err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
if (err)
return err;
tile_v = mlxsw_reg_mfde_fw_assert_tile_v_get(mfde_pl);
if (tile_v) {
val = mlxsw_reg_mfde_fw_assert_tile_index_get(mfde_pl);
err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val);
if (err)
return err;
}
val = mlxsw_reg_mfde_fw_assert_ext_synd_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", val);
if (err)
return err;
return 0;
}
static int
mlxsw_core_health_fw_fatal_dump_kvd_im_stop(const char *mfde_pl,
struct devlink_fmsg *fmsg)
{
u32 val;
int err;
val = mlxsw_reg_mfde_kvd_im_stop_oe_get(mfde_pl);
err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
if (err)
return err;
val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl);
return devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val);
}
@ -1727,6 +1808,10 @@ mlxsw_core_health_fw_fatal_dump_crspace_to(const char *mfde_pl,
val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl);
err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val);
if (err)
return err;
val = mlxsw_reg_mfde_crspace_to_oe_get(mfde_pl);
err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val);
if (err)
return err;
val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl);
@ -1774,6 +1859,15 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
val_str = "KVD insertion machine stopped";
break;
case MLXSW_REG_MFDE_EVENT_ID_TEST:
val_str = "Test";
break;
case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
val_str = "FW assert";
break;
case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
val_str = "Fatal cause";
break;
default:
val_str = NULL;
}
@ -1782,6 +1876,38 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
if (err)
return err;
}
err = devlink_fmsg_arr_pair_nest_end(fmsg);
if (err)
return err;
err = devlink_fmsg_arr_pair_nest_start(fmsg, "severity");
if (err)
return err;
val = mlxsw_reg_mfde_severity_get(mfde_pl);
err = devlink_fmsg_u8_pair_put(fmsg, "id", val);
if (err)
return err;
switch (val) {
case MLXSW_REG_MFDE_SEVERITY_FATL:
val_str = "Fatal";
break;
case MLXSW_REG_MFDE_SEVERITY_NRML:
val_str = "Normal";
break;
case MLXSW_REG_MFDE_SEVERITY_INTR:
val_str = "Debug";
break;
default:
val_str = NULL;
}
if (val_str) {
err = devlink_fmsg_string_pair_put(fmsg, "desc", val_str);
if (err)
return err;
}
err = devlink_fmsg_arr_pair_nest_end(fmsg);
if (err)
return err;
@ -1840,6 +1966,11 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor
case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP:
return mlxsw_core_health_fw_fatal_dump_kvd_im_stop(mfde_pl,
fmsg);
case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT:
return mlxsw_core_health_fw_fatal_dump_fw_assert(mfde_pl, fmsg);
case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE:
return mlxsw_core_health_fw_fatal_dump_fatal_cause(mfde_pl,
fmsg);
}
return 0;