net/mlx5: Fix sync reset event handler error flow

When sync reset now event handling fails on mlx5_pci_link_toggle() then
no reset was done. However, since mlx5_cmd_fast_teardown_hca() was
already done, the firmware function is closed and the driver is left
without firmware functionality.

Fix it by setting device error state and reopen the firmware resources.
Reopening is done by the thread that was called for devlink reload
fw_activate as it already holds the devlink lock.

Fixes: 5ec697446f ("net/mlx5: Add support for devlink reload action fw activate")
Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Reviewed-by: Aya Levin <ayal@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
Moshe Shemesh 2022-10-29 09:03:48 +03:00 committed by Saeed Mahameed
parent 6d942e4044
commit e1ad07b922
1 changed files with 7 additions and 2 deletions

View File

@ -9,7 +9,8 @@ enum {
MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
MLX5_FW_RESET_FLAGS_PENDING_COMP,
MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS
MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS,
MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED
};
struct mlx5_fw_reset {
@ -406,7 +407,7 @@ static void mlx5_sync_reset_now_event(struct work_struct *work)
err = mlx5_pci_link_toggle(dev);
if (err) {
mlx5_core_warn(dev, "mlx5_pci_link_toggle failed, no reset done, err %d\n", err);
goto done;
set_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags);
}
mlx5_enter_error_state(dev, true);
@ -482,6 +483,10 @@ int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev)
goto out;
}
err = fw_reset->ret;
if (test_and_clear_bit(MLX5_FW_RESET_FLAGS_RELOAD_REQUIRED, &fw_reset->reset_flags)) {
mlx5_unload_one_devl_locked(dev);
mlx5_load_one_devl_locked(dev, false);
}
out:
clear_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags);
return err;