diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst index 196a4bb28df1..702f204a3dbd 100644 --- a/Documentation/networking/devlink/mlx5.rst +++ b/Documentation/networking/devlink/mlx5.rst @@ -135,7 +135,7 @@ Health reporters tx reporter ----------- -The tx reporter is responsible for reporting and recovering of the following two error scenarios: +The tx reporter is responsible for reporting and recovering of the following three error scenarios: - tx timeout Report on kernel tx timeout detection. @@ -143,6 +143,9 @@ The tx reporter is responsible for reporting and recovering of the following two - tx error completion Report on error tx completion. Recover by flushing the tx queue and reset it. +- tx PTP port timestamping CQ unhealthy + Report too many CQEs never delivered on port ts CQ. + Recover by flushing and re-creating all PTP channels. tx reporter also support on demand diagnose callback, on which it provides real time information of its send queues status. diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h index 0107e4e73bb0..415840c3ef84 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/health.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/health.h @@ -18,6 +18,7 @@ void mlx5e_reporter_tx_create(struct mlx5e_priv *priv); void mlx5e_reporter_tx_destroy(struct mlx5e_priv *priv); void mlx5e_reporter_tx_err_cqe(struct mlx5e_txqsq *sq); int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq); +void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq); int mlx5e_health_cq_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg); int mlx5e_health_cq_common_diag_fmsg(struct mlx5e_cq *cq, struct devlink_fmsg *fmsg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c index 8680d21f3e7b..bb11e644d24f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c @@ -2,6 +2,7 @@ // Copyright (c) 2020 Mellanox Technologies #include "en/ptp.h" +#include "en/health.h" #include "en/txrx.h" #include "en/params.h" #include "en/fs_tt_redirect.h" @@ -140,6 +141,12 @@ mlx5e_ptp_metadata_map_remove(struct mlx5e_ptp_metadata_map *map, u16 metadata) return skb; } +static bool mlx5e_ptp_metadata_map_unhealthy(struct mlx5e_ptp_metadata_map *map) +{ + /* Considered beginning unhealthy state if size * 15 / 2^4 cannot be reclaimed. */ + return map->undelivered_counter > (map->capacity >> 4) * 15; +} + static void mlx5e_ptpsq_mark_ts_cqes_undelivered(struct mlx5e_ptpsq *ptpsq, ktime_t port_tstamp) { @@ -205,6 +212,9 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, out: napi_consume_skb(skb, budget); mlx5e_ptp_metadata_fifo_push(&ptpsq->metadata_freelist, metadata_id); + if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && + !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) + queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); } static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget) @@ -422,6 +432,14 @@ static void mlx5e_ptp_free_traffic_db(struct mlx5e_ptpsq *ptpsq) kvfree(ptpsq->ts_cqe_pending_list); } +static void mlx5e_ptpsq_unhealthy_work(struct work_struct *work) +{ + struct mlx5e_ptpsq *ptpsq = + container_of(work, struct mlx5e_ptpsq, report_unhealthy_work); + + mlx5e_reporter_tx_ptpsq_unhealthy(ptpsq); +} + static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn, int txq_ix, struct mlx5e_ptp_params *cparams, int tc, struct mlx5e_ptpsq *ptpsq) @@ -451,6 +469,8 @@ static int mlx5e_ptp_open_txqsq(struct mlx5e_ptp *c, u32 tisn, if (err) goto err_free_txqsq; + INIT_WORK(&ptpsq->report_unhealthy_work, mlx5e_ptpsq_unhealthy_work); + return 0; err_free_txqsq: @@ -464,6 +484,8 @@ static void mlx5e_ptp_close_txqsq(struct mlx5e_ptpsq *ptpsq) struct mlx5e_txqsq *sq = &ptpsq->txqsq; struct mlx5_core_dev *mdev = sq->mdev; + if (current_work() != &ptpsq->report_unhealthy_work) + cancel_work_sync(&ptpsq->report_unhealthy_work); mlx5e_ptp_free_traffic_db(ptpsq); cancel_work_sync(&sq->recover_work); mlx5e_ptp_destroy_sq(mdev, sq->sqn); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h index 7c5597d4589d..7b700d0f956a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.h @@ -10,6 +10,7 @@ #include #include #include +#include #define MLX5E_PTP_CHANNEL_IX 0 #define MLX5E_PTP_MAX_LOG_SQ_SIZE (8U) @@ -34,6 +35,7 @@ struct mlx5e_ptpsq { struct mlx5e_ptp_cq_stats *cq_stats; u16 ts_cqe_ctr_mask; + struct work_struct report_unhealthy_work; struct mlx5e_ptp_port_ts_cqe_list *ts_cqe_pending_list; struct mlx5e_ptp_metadata_fifo metadata_freelist; struct mlx5e_ptp_metadata_map metadata_map; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c index b35ff289af49..ff8242f67c54 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c @@ -164,6 +164,43 @@ static int mlx5e_tx_reporter_timeout_recover(void *ctx) return err; } +static int mlx5e_tx_reporter_ptpsq_unhealthy_recover(void *ctx) +{ + struct mlx5e_ptpsq *ptpsq = ctx; + struct mlx5e_channels *chs; + struct net_device *netdev; + struct mlx5e_priv *priv; + int carrier_ok; + int err; + + if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &ptpsq->txqsq.state)) + return 0; + + priv = ptpsq->txqsq.priv; + + mutex_lock(&priv->state_lock); + chs = &priv->channels; + netdev = priv->netdev; + + carrier_ok = netif_carrier_ok(netdev); + netif_carrier_off(netdev); + + mlx5e_deactivate_priv_channels(priv); + + mlx5e_ptp_close(chs->ptp); + err = mlx5e_ptp_open(priv, &chs->params, chs->c[0]->lag_port, &chs->ptp); + + mlx5e_activate_priv_channels(priv); + + /* return carrier back if needed */ + if (carrier_ok) + netif_carrier_on(netdev); + + mutex_unlock(&priv->state_lock); + + return err; +} + /* state lock cannot be grabbed within this function. * It can cause a dead lock or a read-after-free. */ @@ -516,6 +553,15 @@ static int mlx5e_tx_reporter_timeout_dump(struct mlx5e_priv *priv, struct devlin return mlx5e_tx_reporter_dump_sq(priv, fmsg, to_ctx->sq); } +static int mlx5e_tx_reporter_ptpsq_unhealthy_dump(struct mlx5e_priv *priv, + struct devlink_fmsg *fmsg, + void *ctx) +{ + struct mlx5e_ptpsq *ptpsq = ctx; + + return mlx5e_tx_reporter_dump_sq(priv, fmsg, &ptpsq->txqsq); +} + static int mlx5e_tx_reporter_dump_all_sqs(struct mlx5e_priv *priv, struct devlink_fmsg *fmsg) { @@ -621,6 +667,25 @@ int mlx5e_reporter_tx_timeout(struct mlx5e_txqsq *sq) return to_ctx.status; } +void mlx5e_reporter_tx_ptpsq_unhealthy(struct mlx5e_ptpsq *ptpsq) +{ + struct mlx5e_ptp_metadata_map *map = &ptpsq->metadata_map; + char err_str[MLX5E_REPORTER_PER_Q_MAX_LEN]; + struct mlx5e_txqsq *txqsq = &ptpsq->txqsq; + struct mlx5e_cq *ts_cq = &ptpsq->ts_cq; + struct mlx5e_priv *priv = txqsq->priv; + struct mlx5e_err_ctx err_ctx = {}; + + err_ctx.ctx = ptpsq; + err_ctx.recover = mlx5e_tx_reporter_ptpsq_unhealthy_recover; + err_ctx.dump = mlx5e_tx_reporter_ptpsq_unhealthy_dump; + snprintf(err_str, sizeof(err_str), + "Unhealthy TX port TS queue: %d, SQ: 0x%x, CQ: 0x%x, Undelivered CQEs: %u Map Capacity: %u", + txqsq->ch_ix, txqsq->sqn, ts_cq->mcq.cqn, map->undelivered_counter, map->capacity); + + mlx5e_health_report(priv, priv->tx_reporter, err_str, &err_ctx); +} + static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { .name = "tx", .recover = mlx5e_tx_reporter_recover,