diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1251d78ffd46..b4d9374548f8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2138,10 +2138,12 @@ static int bnxt_async_event_process(struct bnxt *bp, set_bit(BNXT_STATE_FW_ACTIVATE_RESET, &bp->state); } else if (EVENT_DATA1_RESET_NOTIFY_FATAL(data1)) { type_str = "Fatal"; + bp->fw_health->fatalities++; set_bit(BNXT_STATE_FW_FATAL_COND, &bp->state); } else if (data2 && BNXT_FW_STATUS_HEALTHY != EVENT_DATA2_RESET_NOTIFY_FW_STATUS_CODE(data2)) { type_str = "Non-fatal"; + bp->fw_health->survivals++; set_bit(BNXT_STATE_FW_NON_FATAL_COND, &bp->state); } netif_warn(bp, hw, bp->dev, @@ -7604,6 +7606,7 @@ static int __bnxt_alloc_fw_health(struct bnxt *bp) if (!bp->fw_health) return -ENOMEM; + mutex_init(&bp->fw_health->lock); return 0; } @@ -7650,12 +7653,16 @@ static void bnxt_inv_fw_health_reg(struct bnxt *bp) struct bnxt_fw_health *fw_health = bp->fw_health; u32 reg_type; - if (!fw_health || !fw_health->status_reliable) + if (!fw_health) return; reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_HEALTH_REG]); if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC) fw_health->status_reliable = false; + + reg_type = BNXT_FW_HEALTH_REG_TYPE(fw_health->regs[BNXT_FW_RESET_CNT_REG]); + if (reg_type == BNXT_FW_HEALTH_REG_TYPE_GRC) + fw_health->resets_reliable = false; } static void bnxt_try_map_fw_health_reg(struct bnxt *bp) @@ -7712,6 +7719,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp) int i; bp->fw_health->status_reliable = false; + bp->fw_health->resets_reliable = false; /* Only pre-map the monitoring GRC registers using window 3 */ for (i = 0; i < 4; i++) { u32 reg = fw_health->regs[i]; @@ -7725,6 +7733,7 @@ static int bnxt_map_fw_health_regs(struct bnxt *bp) fw_health->mapped_regs[i] = BNXT_FW_HEALTH_WIN_OFF(reg); } bp->fw_health->status_reliable = true; + bp->fw_health->resets_reliable = true; if (reg_base == 0xffffffff) return 0; @@ -11264,14 +11273,18 @@ static void bnxt_fw_health_check(struct bnxt *bp) } val = bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); - if (val == fw_health->last_fw_heartbeat) + if (val == fw_health->last_fw_heartbeat) { + fw_health->arrests++; goto fw_reset; + } fw_health->last_fw_heartbeat = val; val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); - if (val != fw_health->last_fw_reset_cnt) + if (val != fw_health->last_fw_reset_cnt) { + fw_health->discoveries++; goto fw_reset; + } fw_health->tmr_counter = fw_health->tmr_multiplier; return; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index 2873f600a7dd..bbbc63e882d1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -1523,6 +1523,21 @@ struct bnxt_ctx_mem_info { struct bnxt_mem_init mem_init[BNXT_CTX_MEM_INIT_MAX]; }; +enum bnxt_health_severity { + SEVERITY_NORMAL = 0, + SEVERITY_WARNING, + SEVERITY_RECOVERABLE, + SEVERITY_FATAL, +}; + +enum bnxt_health_remedy { + REMEDY_DEVLINK_RECOVER, + REMEDY_POWER_CYCLE_DEVICE, + REMEDY_POWER_CYCLE_HOST, + REMEDY_FW_UPDATE, + REMEDY_HW_REPLACE, +}; + struct bnxt_fw_health { u32 flags; u32 polling_dsecs; @@ -1542,6 +1557,7 @@ struct bnxt_fw_health { u8 enabled:1; u8 primary:1; u8 status_reliable:1; + u8 resets_reliable:1; u8 tmr_multiplier; u8 tmr_counter; u8 fw_reset_seq_cnt; @@ -1551,6 +1567,15 @@ struct bnxt_fw_health { u32 echo_req_data1; u32 echo_req_data2; struct devlink_health_reporter *fw_reporter; + /* Protects severity and remedy */ + struct mutex lock; + enum bnxt_health_severity severity; + enum bnxt_health_remedy remedy; + u32 arrests; + u32 discoveries; + u32 survivals; + u32 fatalities; + u32 diagnoses; }; #define BNXT_FW_HEALTH_REG_TYPE_MASK 3 diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index a295d2042b6e..930cbf1ca4e0 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -71,43 +71,110 @@ static int bnxt_hwrm_remote_dev_reset_set(struct bnxt *bp, bool remote_reset) return hwrm_req_send(bp, req); } +static char *bnxt_health_severity_str(enum bnxt_health_severity severity) +{ + switch (severity) { + case SEVERITY_NORMAL: return "normal"; + case SEVERITY_WARNING: return "warning"; + case SEVERITY_RECOVERABLE: return "recoverable"; + case SEVERITY_FATAL: return "fatal"; + default: return "unknown"; + } +} + +static char *bnxt_health_remedy_str(enum bnxt_health_remedy remedy) +{ + switch (remedy) { + case REMEDY_DEVLINK_RECOVER: return "devlink recover"; + case REMEDY_POWER_CYCLE_DEVICE: return "device power cycle"; + case REMEDY_POWER_CYCLE_HOST: return "host power cycle"; + case REMEDY_FW_UPDATE: return "update firmware"; + case REMEDY_HW_REPLACE: return "replace hardware"; + default: return "unknown"; + } +} + static int bnxt_fw_diagnose(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, struct netlink_ext_ack *extack) { struct bnxt *bp = devlink_health_reporter_priv(reporter); - u32 val; + struct bnxt_fw_health *h = bp->fw_health; + u32 fw_status, fw_resets; int rc; if (test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) - return 0; + return devlink_fmsg_string_pair_put(fmsg, "Status", "recovering"); - val = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); + if (!h->status_reliable) + return devlink_fmsg_string_pair_put(fmsg, "Status", "unknown"); - if (BNXT_FW_IS_BOOTING(val)) { - rc = devlink_fmsg_string_pair_put(fmsg, "Description", - "Not yet completed initialization"); + mutex_lock(&h->lock); + fw_status = bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG); + if (BNXT_FW_IS_BOOTING(fw_status)) { + rc = devlink_fmsg_string_pair_put(fmsg, "Status", "initializing"); if (rc) - return rc; - } else if (BNXT_FW_IS_ERR(val)) { - rc = devlink_fmsg_string_pair_put(fmsg, "Description", - "Encountered fatal error and cannot recover"); + goto unlock; + } else if (h->severity || fw_status != BNXT_FW_STATUS_HEALTHY) { + if (!h->severity) { + h->severity = SEVERITY_FATAL; + h->remedy = REMEDY_POWER_CYCLE_DEVICE; + h->diagnoses++; + devlink_health_report(h->fw_reporter, + "FW error diagnosed", h); + } + rc = devlink_fmsg_string_pair_put(fmsg, "Status", "error"); if (rc) - return rc; + goto unlock; + rc = devlink_fmsg_u32_pair_put(fmsg, "Syndrome", fw_status); + if (rc) + goto unlock; + } else { + rc = devlink_fmsg_string_pair_put(fmsg, "Status", "healthy"); + if (rc) + goto unlock; } - if (val >> 16) { - rc = devlink_fmsg_u32_pair_put(fmsg, "Error code", val >> 16); - if (rc) - return rc; - } - - val = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); - rc = devlink_fmsg_u32_pair_put(fmsg, "Reset count", val); + rc = devlink_fmsg_string_pair_put(fmsg, "Severity", + bnxt_health_severity_str(h->severity)); if (rc) + goto unlock; + + if (h->severity) { + rc = devlink_fmsg_string_pair_put(fmsg, "Remedy", + bnxt_health_remedy_str(h->remedy)); + if (rc) + goto unlock; + if (h->remedy == REMEDY_DEVLINK_RECOVER) { + rc = devlink_fmsg_string_pair_put(fmsg, "Impact", + "traffic+ntuple_cfg"); + if (rc) + goto unlock; + } + } + +unlock: + mutex_unlock(&h->lock); + if (rc || !h->resets_reliable) return rc; - return 0; + fw_resets = bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + rc = devlink_fmsg_u32_pair_put(fmsg, "Resets", fw_resets); + if (rc) + return rc; + rc = devlink_fmsg_u32_pair_put(fmsg, "Arrests", h->arrests); + if (rc) + return rc; + rc = devlink_fmsg_u32_pair_put(fmsg, "Survivals", h->survivals); + if (rc) + return rc; + rc = devlink_fmsg_u32_pair_put(fmsg, "Discoveries", h->discoveries); + if (rc) + return rc; + rc = devlink_fmsg_u32_pair_put(fmsg, "Fatalities", h->fatalities); + if (rc) + return rc; + return devlink_fmsg_u32_pair_put(fmsg, "Diagnoses", h->diagnoses); } static int bnxt_fw_recover(struct devlink_health_reporter *reporter, @@ -116,6 +183,9 @@ static int bnxt_fw_recover(struct devlink_health_reporter *reporter, { struct bnxt *bp = devlink_health_reporter_priv(reporter); + if (bp->fw_health->severity == SEVERITY_FATAL) + return -ENODEV; + set_bit(BNXT_STATE_RECOVER, &bp->state); __bnxt_fw_recover(bp); @@ -165,6 +235,7 @@ void bnxt_dl_fw_reporters_destroy(struct bnxt *bp, bool all) void bnxt_devlink_health_fw_report(struct bnxt *bp) { struct bnxt_fw_health *fw_health = bp->fw_health; + int rc; if (!fw_health) return; @@ -174,20 +245,32 @@ void bnxt_devlink_health_fw_report(struct bnxt *bp) return; } - devlink_health_report(fw_health->fw_reporter, "FW error reported", NULL); + mutex_lock(&fw_health->lock); + fw_health->severity = SEVERITY_RECOVERABLE; + fw_health->remedy = REMEDY_DEVLINK_RECOVER; + mutex_unlock(&fw_health->lock); + rc = devlink_health_report(fw_health->fw_reporter, "FW error reported", + fw_health); + if (rc == -ECANCELED) + __bnxt_fw_recover(bp); } void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy) { - struct bnxt_fw_health *health = bp->fw_health; + struct bnxt_fw_health *fw_health = bp->fw_health; u8 state; - if (healthy) + mutex_lock(&fw_health->lock); + if (healthy) { + fw_health->severity = SEVERITY_NORMAL; state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - else + } else { + fw_health->severity = SEVERITY_FATAL; + fw_health->remedy = REMEDY_POWER_CYCLE_DEVICE; state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - - devlink_health_reporter_state_update(health->fw_reporter, state); + } + mutex_unlock(&fw_health->lock); + devlink_health_reporter_state_update(fw_health->fw_reporter, state); } void bnxt_dl_health_fw_recovery_done(struct bnxt *bp)