From 3f26a315f88274361aea86c2bcd6af4bb59b1a87 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 24 Nov 2022 13:05:53 +0100 Subject: [PATCH 01/12] net/mlx5e: Fix trap event handling Current code does not return correct return value from event handler. Fix it by returning NOTIFY_* and propagate err over newly introduce ctx structure. Signed-off-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/devlink.c | 24 +++++++++---------- .../net/ethernet/mellanox/mlx5/core/devlink.h | 5 ++++ .../net/ethernet/mellanox/mlx5/core/en_main.c | 12 ++++++---- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index 5bd83c0275f8..c06baa317fbd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -263,9 +263,10 @@ static int mlx5_devlink_trap_action_set(struct devlink *devlink, struct netlink_ext_ack *extack) { struct mlx5_core_dev *dev = devlink_priv(devlink); + struct mlx5_devlink_trap_event_ctx trap_event_ctx; enum devlink_trap_action action_orig; struct mlx5_devlink_trap *dl_trap; - int err = 0; + int err; if (is_mdev_switchdev_mode(dev)) { NL_SET_ERR_MSG_MOD(extack, "Devlink traps can't be set in switchdev mode"); @@ -275,26 +276,25 @@ static int mlx5_devlink_trap_action_set(struct devlink *devlink, dl_trap = mlx5_find_trap_by_id(dev, trap->id); if (!dl_trap) { mlx5_core_err(dev, "Devlink trap: Set action on invalid trap id 0x%x", trap->id); - err = -EINVAL; - goto out; + return -EINVAL; } - if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) { - err = -EOPNOTSUPP; - goto out; - } + if (action != DEVLINK_TRAP_ACTION_DROP && action != DEVLINK_TRAP_ACTION_TRAP) + return -EOPNOTSUPP; if (action == dl_trap->trap.action) - goto out; + return 0; action_orig = dl_trap->trap.action; dl_trap->trap.action = action; + trap_event_ctx.trap = &dl_trap->trap; + trap_event_ctx.err = 0; err = mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_TYPE_TRAP, - &dl_trap->trap); - if (err) + &trap_event_ctx); + if (err == NOTIFY_BAD) dl_trap->trap.action = action_orig; -out: - return err; + + return trap_event_ctx.err; } static const struct devlink_ops mlx5_devlink_ops = { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h index fd033df24856..b84cb70eb3ae 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h @@ -24,6 +24,11 @@ struct mlx5_devlink_trap { struct list_head list; }; +struct mlx5_devlink_trap_event_ctx { + struct mlx5_trap_ctx *trap; + int err; +}; + struct mlx5_core_dev; void mlx5_devlink_trap_report(struct mlx5_core_dev *dev, int trap_id, struct sk_buff *skb, struct devlink_port *dl_port); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cff5f2e29e1e..7471c4e8004c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -179,17 +179,21 @@ static void mlx5e_disable_async_events(struct mlx5e_priv *priv) static int blocking_event(struct notifier_block *nb, unsigned long event, void *data) { struct mlx5e_priv *priv = container_of(nb, struct mlx5e_priv, blocking_events_nb); + struct mlx5_devlink_trap_event_ctx *trap_event_ctx = data; int err; switch (event) { case MLX5_DRIVER_EVENT_TYPE_TRAP: - err = mlx5e_handle_trap_event(priv, data); + err = mlx5e_handle_trap_event(priv, trap_event_ctx->trap); + if (err) { + trap_event_ctx->err = err; + return NOTIFY_BAD; + } break; default: - netdev_warn(priv->netdev, "Sync event: Unknown event %ld\n", event); - err = -EINVAL; + return NOTIFY_DONE; } - return err; + return NOTIFY_OK; } static void mlx5e_enable_blocking_events(struct mlx5e_priv *priv) From c7d4e6ab3165693342c21f6faf80d983137fee0c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 1 Nov 2022 15:27:43 +0100 Subject: [PATCH 02/12] net/mlx5e: Propagate an internal event in case uplink netdev changes Whenever uplink netdev is set/cleared, propagate newly introduced event to inform notifier blocks netdev was added/removed. Move the set() helper to core.c from header, introduce clear() and netdev_added_event_replay() helpers. The last one is going to be called from rdma driver, so export it. Signed-off-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++- .../ethernet/mellanox/mlx5/core/lib/mlx5.h | 5 ----- .../net/ethernet/mellanox/mlx5/core/main.c | 20 +++++++++++++++++++ include/linux/mlx5/device.h | 1 + include/linux/mlx5/driver.h | 5 +++++ 5 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 7471c4e8004c..85b51039d2a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5961,7 +5961,7 @@ static int mlx5e_probe(struct auxiliary_device *adev, } mlx5e_dcbnl_init_app(priv); - mlx5_uplink_netdev_set(mdev, netdev); + mlx5_core_uplink_netdev_set(mdev, netdev); mlx5e_params_print_info(mdev, &priv->channels.params); return 0; @@ -5981,6 +5981,7 @@ static void mlx5e_remove(struct auxiliary_device *adev) struct mlx5e_priv *priv = auxiliary_get_drvdata(adev); pm_message_t state = {}; + mlx5_core_uplink_netdev_set(priv->mdev, NULL); mlx5e_dcbnl_delete_app(priv); unregister_netdev(priv->netdev); mlx5e_suspend(adev, state); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index 032adb21ad4b..bfd3a1121ed8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -96,11 +96,6 @@ static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) return devlink_net(priv_to_devlink(dev)); } -static inline void mlx5_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev) -{ - mdev->mlx5e_res.uplink_netdev = netdev; -} - static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { return mdev->mlx5e_res.uplink_netdev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index df134f6d32dc..72716d1f8b37 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -336,6 +336,24 @@ static u16 to_fw_pkey_sz(struct mlx5_core_dev *dev, u32 size) } } +void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *dev, struct net_device *netdev) +{ + mutex_lock(&dev->mlx5e_res.uplink_netdev_lock); + dev->mlx5e_res.uplink_netdev = netdev; + mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_UPLINK_NETDEV, + netdev); + mutex_unlock(&dev->mlx5e_res.uplink_netdev_lock); +} + +void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *dev) +{ + mutex_lock(&dev->mlx5e_res.uplink_netdev_lock); + mlx5_blocking_notifier_call_chain(dev, MLX5_DRIVER_EVENT_UPLINK_NETDEV, + dev->mlx5e_res.uplink_netdev); + mutex_unlock(&dev->mlx5e_res.uplink_netdev_lock); +} +EXPORT_SYMBOL(mlx5_core_uplink_netdev_event_replay); + static int mlx5_core_get_caps_mode(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type, enum mlx5_cap_mode cap_mode) @@ -1608,6 +1626,7 @@ int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) lockdep_register_key(&dev->lock_key); mutex_init(&dev->intf_state_mutex); lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key); + mutex_init(&dev->mlx5e_res.uplink_netdev_lock); mutex_init(&priv->bfregs.reg_head.lock); mutex_init(&priv->bfregs.wc_head.lock); @@ -1696,6 +1715,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) mutex_destroy(&priv->alloc_mutex); mutex_destroy(&priv->bfregs.wc_head.lock); mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->mlx5e_res.uplink_netdev_lock); mutex_destroy(&dev->intf_state_mutex); lockdep_unregister_key(&dev->lock_key); } diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 29d4b201c7b2..f2b271169daf 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -362,6 +362,7 @@ enum mlx5_event { enum mlx5_driver_event { MLX5_DRIVER_EVENT_TYPE_TRAP = 0, + MLX5_DRIVER_EVENT_UPLINK_NETDEV, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d476255c9a3f..cc48aa308269 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -674,6 +675,7 @@ struct mlx5e_resources { } hw_objs; struct devlink_port dl_port; struct net_device *uplink_netdev; + struct mutex uplink_netdev_lock; }; enum mlx5_sw_icm_type { @@ -1011,6 +1013,9 @@ int mlx5_cmd_exec_polling(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); bool mlx5_cmd_is_down(struct mlx5_core_dev *dev); +void mlx5_core_uplink_netdev_set(struct mlx5_core_dev *mdev, struct net_device *netdev); +void mlx5_core_uplink_netdev_event_replay(struct mlx5_core_dev *mdev); + int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type); void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); From dca55da0a15717dde509d17163946e951bad56c4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 1 Nov 2022 15:36:01 +0100 Subject: [PATCH 03/12] RDMA/mlx5: Track netdev to avoid deadlock during netdev notifier unregister When removing a network namespace with mlx5 devlink instance being in it, following callchain is performed: cleanup_net (takes down_read(&pernet_ops_rwsem) devlink_pernet_pre_exit() devlink_reload() mlx5_devlink_reload_down() mlx5_unload_one_devl_locked() mlx5_detach_device() del_adev() mlx5r_remove() __mlx5_ib_remove() mlx5_ib_roce_cleanup() mlx5_remove_netdev_notifier() unregister_netdevice_notifier (takes down_write(&pernet_ops_rwsem) This deadlocks. Resolve this by converting to register_netdevice_notifier_dev_net() which does not take pernet_ops_rwsem and moves the notifier block around according to netdev it takes as arg. Use previously introduced netdev added/removed events to track uplink netdev to be used for register_netdevice_notifier_dev_net() purposes. Signed-off-by: Jiri Pirko Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed --- drivers/infiniband/hw/mlx5/main.c | 80 +++++++++++++------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 + .../net/ethernet/mellanox/mlx5/core/events.c | 2 + 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c669ef6e47e7..dc32e4518a28 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3012,26 +3012,63 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) } } -static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num) +static void mlx5_netdev_notifier_register(struct mlx5_roce *roce, + struct net_device *netdev) { int err; - dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event; - err = register_netdevice_notifier(&dev->port[port_num].roce.nb); - if (err) { - dev->port[port_num].roce.nb.notifier_call = NULL; - return err; - } - - return 0; + if (roce->tracking_netdev) + return; + roce->tracking_netdev = netdev; + roce->nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier_dev_net(netdev, &roce->nb, &roce->nn); + WARN_ON(err); } -static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u32 port_num) +static void mlx5_netdev_notifier_unregister(struct mlx5_roce *roce) { - if (dev->port[port_num].roce.nb.notifier_call) { - unregister_netdevice_notifier(&dev->port[port_num].roce.nb); - dev->port[port_num].roce.nb.notifier_call = NULL; + if (!roce->tracking_netdev) + return; + unregister_netdevice_notifier_dev_net(roce->tracking_netdev, &roce->nb, + &roce->nn); + roce->tracking_netdev = NULL; +} + +static int mlx5e_mdev_notifier_event(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct mlx5_roce *roce = container_of(nb, struct mlx5_roce, mdev_nb); + struct net_device *netdev = data; + + switch (event) { + case MLX5_DRIVER_EVENT_UPLINK_NETDEV: + if (netdev) + mlx5_netdev_notifier_register(roce, netdev); + else + mlx5_netdev_notifier_unregister(roce); + break; + default: + return NOTIFY_DONE; } + + return NOTIFY_OK; +} + +static void mlx5_mdev_netdev_track(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_roce *roce = &dev->port[port_num].roce; + + roce->mdev_nb.notifier_call = mlx5e_mdev_notifier_event; + mlx5_blocking_notifier_register(dev->mdev, &roce->mdev_nb); + mlx5_core_uplink_netdev_event_replay(dev->mdev); +} + +static void mlx5_mdev_netdev_untrack(struct mlx5_ib_dev *dev, u32 port_num) +{ + struct mlx5_roce *roce = &dev->port[port_num].roce; + + mlx5_blocking_notifier_unregister(dev->mdev, &roce->mdev_nb); + mlx5_netdev_notifier_unregister(roce); } static int mlx5_enable_eth(struct mlx5_ib_dev *dev) @@ -3138,7 +3175,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, if (mpi->mdev_events.notifier_call) mlx5_notifier_unregister(mpi->mdev, &mpi->mdev_events); mpi->mdev_events.notifier_call = NULL; - mlx5_remove_netdev_notifier(ibdev, port_num); + mlx5_mdev_netdev_untrack(ibdev, port_num); spin_lock(&port->mp.mpi_lock); comps = mpi->mdev_refcnt; @@ -3196,12 +3233,7 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, if (err) goto unbind; - err = mlx5_add_netdev_notifier(ibdev, port_num); - if (err) { - mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", - port_num + 1); - goto unbind; - } + mlx5_mdev_netdev_track(ibdev, port_num); mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port; mlx5_notifier_register(mpi->mdev, &mpi->mdev_events); @@ -3909,9 +3941,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev) port_num = mlx5_core_native_port_num(dev->mdev) - 1; /* Register only for native ports */ - err = mlx5_add_netdev_notifier(dev, port_num); - if (err) - return err; + mlx5_mdev_netdev_track(dev, port_num); err = mlx5_enable_eth(dev); if (err) @@ -3920,7 +3950,7 @@ static int mlx5_ib_roce_init(struct mlx5_ib_dev *dev) return 0; cleanup: - mlx5_remove_netdev_notifier(dev, port_num); + mlx5_mdev_netdev_untrack(dev, port_num); return err; } @@ -3938,7 +3968,7 @@ static void mlx5_ib_roce_cleanup(struct mlx5_ib_dev *dev) mlx5_disable_eth(dev); port_num = mlx5_core_native_port_num(dev->mdev) - 1; - mlx5_remove_netdev_notifier(dev, port_num); + mlx5_mdev_netdev_untrack(dev, port_num); } } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8b91babdd4c0..7394e7f36ba7 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -832,6 +832,9 @@ struct mlx5_roce { rwlock_t netdev_lock; struct net_device *netdev; struct notifier_block nb; + struct netdev_net_notifier nn; + struct notifier_block mdev_nb; + struct net_device *tracking_netdev; atomic_t tx_port_affinity; enum ib_port_state last_port_state; struct mlx5_ib_dev *dev; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c index 9459e56ee90a..718cf09c28ce 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/events.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -424,6 +424,7 @@ int mlx5_blocking_notifier_register(struct mlx5_core_dev *dev, struct notifier_b return blocking_notifier_chain_register(&events->sw_nh, nb); } +EXPORT_SYMBOL(mlx5_blocking_notifier_register); int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier_block *nb) { @@ -431,6 +432,7 @@ int mlx5_blocking_notifier_unregister(struct mlx5_core_dev *dev, struct notifier return blocking_notifier_chain_unregister(&events->sw_nh, nb); } +EXPORT_SYMBOL(mlx5_blocking_notifier_unregister); int mlx5_blocking_notifier_call_chain(struct mlx5_core_dev *dev, unsigned int event, void *data) From 7368f221e09eac116050475bec5a292a85c5ea8a Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Wed, 4 Jan 2023 20:17:52 -0800 Subject: [PATCH 04/12] net/mlx5: Introduce new destination type TABLE_TYPE This new destination type supports flow transition between different table types, e.g. from NIC_RX to RDMA_RX or from RDMA_TX to NIC_TX. Signed-off-by: Patrisious Haddad Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2d17b6a6d82d..c3d3a2eef7d4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -315,6 +315,11 @@ enum { MLX5_CMD_OP_GENERAL_END = 0xd00, }; +enum { + MLX5_FT_NIC_RX_2_NIC_RX_RDMA = BIT(0), + MLX5_FT_NIC_TX_RDMA_2_NIC_TX = BIT(1), +}; + struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_dmac[0x1]; u8 outer_smac[0x1]; @@ -1903,7 +1908,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_e0[0xc0]; - u8 reserved_at_1a0[0xb]; + u8 flow_table_type_2_type[0x8]; + u8 reserved_at_1a8[0x3]; u8 log_min_mkey_entity_size[0x5]; u8 reserved_at_1b0[0x10]; @@ -1927,6 +1933,7 @@ enum mlx5_ifc_flow_destination_type { MLX5_IFC_FLOW_DESTINATION_TYPE_TIR = 0x2, MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER = 0x6, MLX5_IFC_FLOW_DESTINATION_TYPE_UPLINK = 0x8, + MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE = 0xA, }; enum mlx5_flow_table_miss_action { @@ -1941,7 +1948,8 @@ struct mlx5_ifc_dest_format_struct_bits { u8 destination_eswitch_owner_vhca_id_valid[0x1]; u8 packet_reformat[0x1]; - u8 reserved_at_22[0xe]; + u8 reserved_at_22[0x6]; + u8 destination_table_type[0x8]; u8 destination_eswitch_owner_vhca_id[0x10]; }; From 4f226b71f5edc33843e21f92ebe97b1561804d52 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 4 Jan 2023 20:17:53 -0800 Subject: [PATCH 05/12] net/mlx5: Implement new destination type TABLE_TYPE Implement new destination type to support flow transition between different table types. e.g. from NIC_RX to RDMA_RX or from RDMA_TX to NIC_TX. The new destination is described in the tracepoint as follows: "mlx5_fs_add_rule: rule=00000000d53cd0ed fte=0000000048a8a6ed index=0 sw_action=<> [dst] flow_table_type=7 id:262152" Signed-off-by: Mark Zhang Signed-off-by: Patrisious Haddad Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c | 4 ++++ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 6 ++++++ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 3 ++- include/linux/mlx5/fs.h | 1 + 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c index 2732128e7a6e..6d73127b7217 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fs_tracepoint.c @@ -275,6 +275,10 @@ const char *parse_fs_dst(struct trace_seq *p, fs_dest_range_field_to_str(dst->range.field), dst->range.min, dst->range.max); break; + case MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE: + trace_seq_printf(p, "flow_table_type=%u id:%u\n", dst->ft->type, + dst->ft->id); + break; case MLX5_FLOW_DESTINATION_TYPE_NONE: trace_seq_printf(p, "none\n"); break; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c index 32d4c967469c..a3a9cc6f15ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c @@ -653,6 +653,12 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, id = dst->dest_attr.sampler_id; ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_FLOW_SAMPLER; break; + case MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE: + MLX5_SET(dest_format_struct, in_dests, + destination_table_type, dst->dest_attr.ft->type); + id = dst->dest_attr.ft->id; + ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TABLE_TYPE; + break; default: id = dst->dest_attr.tir_num; ifc_type = MLX5_IFC_FLOW_DESTINATION_TYPE_TIR; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 5a85d8c1e797..2333f835fb70 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -449,7 +449,8 @@ static bool is_fwd_dest_type(enum mlx5_flow_destination_type type) type == MLX5_FLOW_DESTINATION_TYPE_VPORT || type == MLX5_FLOW_DESTINATION_TYPE_FLOW_SAMPLER || type == MLX5_FLOW_DESTINATION_TYPE_TIR || - type == MLX5_FLOW_DESTINATION_TYPE_RANGE; + type == MLX5_FLOW_DESTINATION_TYPE_RANGE || + type == MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; } static bool check_valid_spec(const struct mlx5_flow_spec *spec) diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index ba6958b49a8e..1d2a0638ae74 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -51,6 +51,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_COUNTER, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, MLX5_FLOW_DESTINATION_TYPE_RANGE, + MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE, }; enum { From f91ddd3aa4b313aa4b792d1588ccb63c3e4ace0b Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 4 Jan 2023 20:17:54 -0800 Subject: [PATCH 06/12] net/mlx5: Add IPSec priorities in RDMA namespaces Add IPSec flow steering priorities in RDMA namespaces. This allows adding tables/rules to forward RoCEv2 traffic to the IPSec crypto tables in NIC_TX domain, and accept RoCEv2 traffic from NIC_RX domain. Signed-off-by: Mark Zhang Signed-off-by: Patrisious Haddad Reviewed-by: Maor Gottlieb Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/fs_core.c | 35 +++++++++++++++++-- include/linux/mlx5/fs.h | 2 ++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 2333f835fb70..eac9fd35129d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -219,19 +219,30 @@ static struct init_tree_node egress_root_fs = { }; enum { + RDMA_RX_IPSEC_PRIO, RDMA_RX_COUNTERS_PRIO, RDMA_RX_BYPASS_PRIO, RDMA_RX_KERNEL_PRIO, }; +#define RDMA_RX_IPSEC_NUM_PRIOS 1 +#define RDMA_RX_IPSEC_NUM_LEVELS 2 +#define RDMA_RX_IPSEC_MIN_LEVEL (RDMA_RX_IPSEC_NUM_LEVELS) + #define RDMA_RX_BYPASS_MIN_LEVEL MLX5_BY_PASS_NUM_REGULAR_PRIOS #define RDMA_RX_KERNEL_MIN_LEVEL (RDMA_RX_BYPASS_MIN_LEVEL + 1) #define RDMA_RX_COUNTERS_MIN_LEVEL (RDMA_RX_KERNEL_MIN_LEVEL + 2) static struct init_tree_node rdma_rx_root_fs = { .type = FS_TYPE_NAMESPACE, - .ar_size = 3, + .ar_size = 4, .children = (struct init_tree_node[]) { + [RDMA_RX_IPSEC_PRIO] = + ADD_PRIO(0, RDMA_RX_IPSEC_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(RDMA_RX_IPSEC_NUM_PRIOS, + RDMA_RX_IPSEC_NUM_LEVELS))), [RDMA_RX_COUNTERS_PRIO] = ADD_PRIO(0, RDMA_RX_COUNTERS_MIN_LEVEL, 0, FS_CHAINING_CAPS, @@ -254,15 +265,20 @@ static struct init_tree_node rdma_rx_root_fs = { enum { RDMA_TX_COUNTERS_PRIO, + RDMA_TX_IPSEC_PRIO, RDMA_TX_BYPASS_PRIO, }; #define RDMA_TX_BYPASS_MIN_LEVEL MLX5_BY_PASS_NUM_PRIOS #define RDMA_TX_COUNTERS_MIN_LEVEL (RDMA_TX_BYPASS_MIN_LEVEL + 1) +#define RDMA_TX_IPSEC_NUM_PRIOS 1 +#define RDMA_TX_IPSEC_PRIO_NUM_LEVELS 1 +#define RDMA_TX_IPSEC_MIN_LEVEL (RDMA_TX_COUNTERS_MIN_LEVEL + RDMA_TX_IPSEC_NUM_PRIOS) + static struct init_tree_node rdma_tx_root_fs = { .type = FS_TYPE_NAMESPACE, - .ar_size = 2, + .ar_size = 3, .children = (struct init_tree_node[]) { [RDMA_TX_COUNTERS_PRIO] = ADD_PRIO(0, RDMA_TX_COUNTERS_MIN_LEVEL, 0, @@ -270,6 +286,13 @@ static struct init_tree_node rdma_tx_root_fs = { ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, ADD_MULTIPLE_PRIO(MLX5_RDMA_TX_NUM_COUNTERS_PRIOS, RDMA_TX_COUNTERS_PRIO_NUM_LEVELS))), + [RDMA_TX_IPSEC_PRIO] = + ADD_PRIO(0, RDMA_TX_IPSEC_MIN_LEVEL, 0, + FS_CHAINING_CAPS, + ADD_NS(MLX5_FLOW_TABLE_MISS_ACTION_DEF, + ADD_MULTIPLE_PRIO(RDMA_TX_IPSEC_NUM_PRIOS, + RDMA_TX_IPSEC_PRIO_NUM_LEVELS))), + [RDMA_TX_BYPASS_PRIO] = ADD_PRIO(0, RDMA_TX_BYPASS_MIN_LEVEL, 0, FS_CHAINING_CAPS_RDMA_TX, @@ -2368,6 +2391,14 @@ struct mlx5_flow_namespace *mlx5_get_flow_namespace(struct mlx5_core_dev *dev, root_ns = steering->rdma_tx_root_ns; prio = RDMA_TX_COUNTERS_PRIO; break; + case MLX5_FLOW_NAMESPACE_RDMA_RX_IPSEC: + root_ns = steering->rdma_rx_root_ns; + prio = RDMA_RX_IPSEC_PRIO; + break; + case MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC: + root_ns = steering->rdma_tx_root_ns; + prio = RDMA_TX_IPSEC_PRIO; + break; default: /* Must be NIC RX */ WARN_ON(!is_nic_rx_ns(type)); root_ns = steering->root_ns; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 1d2a0638ae74..d72a09a3798c 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -103,6 +103,8 @@ enum mlx5_flow_namespace_type { MLX5_FLOW_NAMESPACE_PORT_SEL, MLX5_FLOW_NAMESPACE_RDMA_RX_COUNTERS, MLX5_FLOW_NAMESPACE_RDMA_TX_COUNTERS, + MLX5_FLOW_NAMESPACE_RDMA_RX_IPSEC, + MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC, }; enum { From 899577600b25b338072095009d4c578fbd936177 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 4 Jan 2023 20:17:55 -0800 Subject: [PATCH 07/12] net/mlx5: Configure IPsec steering for ingress RoCEv2 traffic Add steering tables/rules to check if the decrypted traffic is RoCEv2, if so then forward it to RDMA_RX domain. Signed-off-by: Mark Zhang Signed-off-by: Patrisious Haddad Reviewed-by: Leon Romanovsky Reviewed-by: Raed Salem Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../net/ethernet/mellanox/mlx5/core/en/fs.h | 1 + .../mellanox/mlx5/core/en_accel/ipsec.h | 2 + .../mellanox/mlx5/core/en_accel/ipsec_fs.c | 40 ++- .../mlx5/core/en_accel/ipsec_offload.c | 4 + .../net/ethernet/mellanox/mlx5/core/fs_core.c | 6 +- .../mellanox/mlx5/core/lib/ipsec_fs_roce.c | 249 ++++++++++++++++++ .../mellanox/mlx5/core/lib/ipsec_fs_roce.h | 21 ++ 8 files changed, 317 insertions(+), 8 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index cd4a1ab0ea78..8415a44fb965 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -97,7 +97,7 @@ mlx5_core-$(CONFIG_MLX5_EN_MACSEC) += en_accel/macsec.o en_accel/macsec_fs.o \ mlx5_core-$(CONFIG_MLX5_EN_IPSEC) += en_accel/ipsec.o en_accel/ipsec_rxtx.o \ en_accel/ipsec_stats.o en_accel/ipsec_fs.o \ - en_accel/ipsec_offload.o + en_accel/ipsec_offload.o lib/ipsec_fs_roce.o mlx5_core-$(CONFIG_MLX5_EN_TLS) += en_accel/ktls_stats.o \ en_accel/fs_tcp.o en_accel/ktls.o en_accel/ktls_txrx.o \ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h index 379c6dc9a3be..d2149f0138d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h @@ -87,6 +87,7 @@ enum { MLX5E_ACCEL_FS_POL_FT_LEVEL = MLX5E_INNER_TTC_FT_LEVEL + 1, MLX5E_ACCEL_FS_ESP_FT_LEVEL, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, + MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, #endif }; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index a92e19c4c499..35992866f83b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -84,6 +84,7 @@ enum mlx5_ipsec_cap { MLX5_IPSEC_CAP_CRYPTO = 1 << 0, MLX5_IPSEC_CAP_ESN = 1 << 1, MLX5_IPSEC_CAP_PACKET_OFFLOAD = 1 << 2, + MLX5_IPSEC_CAP_ROCE = 1 << 3, }; struct mlx5e_priv; @@ -141,6 +142,7 @@ struct mlx5e_ipsec { struct mlx5e_ipsec_tx *tx; struct mlx5e_ipsec_aso *aso; struct notifier_block nb; + struct mlx5_ipsec_fs *roce; }; struct mlx5e_ipsec_esn_state { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 9f19f4b59a70..f0a6db19272e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -6,6 +6,7 @@ #include "en/fs.h" #include "ipsec.h" #include "fs_core.h" +#include "lib/ipsec_fs_roce.h" #define NUM_IPSEC_FTE BIT(15) @@ -166,7 +167,8 @@ static int ipsec_miss_create(struct mlx5_core_dev *mdev, return err; } -static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx) +static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, + struct mlx5e_ipsec_rx *rx, u32 family) { mlx5_del_flow_rules(rx->pol.rule); mlx5_destroy_flow_group(rx->pol.group); @@ -179,6 +181,8 @@ static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_rx *rx) mlx5_del_flow_rules(rx->status.rule); mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr); mlx5_destroy_flow_table(rx->ft.status); + + mlx5_ipsec_fs_roce_rx_destroy(ipsec->roce, family); } static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, @@ -186,18 +190,35 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, { struct mlx5_flow_namespace *ns = mlx5e_fs_get_ns(ipsec->fs, false); struct mlx5_ttc_table *ttc = mlx5e_fs_get_ttc(ipsec->fs, false); + struct mlx5_flow_destination default_dest; struct mlx5_flow_destination dest[2]; struct mlx5_flow_table *ft; int err; + default_dest = mlx5_ttc_get_default_dest(ttc, family2tt(family)); + err = mlx5_ipsec_fs_roce_rx_create(mdev, ipsec->roce, ns, &default_dest, + family, MLX5E_ACCEL_FS_ESP_FT_ROCE_LEVEL, + MLX5E_NIC_PRIO); + if (err) + return err; + ft = ipsec_ft_create(ns, MLX5E_ACCEL_FS_ESP_FT_ERR_LEVEL, MLX5E_NIC_PRIO, 1); - if (IS_ERR(ft)) - return PTR_ERR(ft); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + goto err_fs_ft_status; + } rx->ft.status = ft; - dest[0] = mlx5_ttc_get_default_dest(ttc, family2tt(family)); + ft = mlx5_ipsec_fs_roce_ft_get(ipsec->roce, family); + if (ft) { + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = ft; + } else { + dest[0] = default_dest; + } + dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; dest[1].counter_id = mlx5_fc_id(rx->fc->cnt); err = ipsec_status_rule(mdev, rx, dest); @@ -245,6 +266,8 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, mlx5_modify_header_dealloc(mdev, rx->status.modify_hdr); err_add: mlx5_destroy_flow_table(rx->ft.status); +err_fs_ft_status: + mlx5_ipsec_fs_roce_rx_destroy(ipsec->roce, family); return err; } @@ -304,7 +327,7 @@ static void rx_ft_put(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, mlx5_ttc_fwd_default_dest(ttc, family2tt(family)); /* remove FT */ - rx_destroy(mdev, rx); + rx_destroy(mdev, ipsec, rx, family); out: mutex_unlock(&rx->ft.mutex); @@ -1008,6 +1031,9 @@ void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec) if (!ipsec->tx) return; + if (mlx5_ipsec_device_caps(ipsec->mdev) & MLX5_IPSEC_CAP_ROCE) + mlx5_ipsec_fs_roce_cleanup(ipsec->roce); + ipsec_fs_destroy_counters(ipsec); mutex_destroy(&ipsec->tx->ft.mutex); WARN_ON(ipsec->tx->ft.refcnt); @@ -1024,6 +1050,7 @@ void mlx5e_accel_ipsec_fs_cleanup(struct mlx5e_ipsec *ipsec) int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec) { + struct mlx5_core_dev *mdev = ipsec->mdev; struct mlx5_flow_namespace *ns; int err = -ENOMEM; @@ -1053,6 +1080,9 @@ int mlx5e_accel_ipsec_fs_init(struct mlx5e_ipsec *ipsec) mutex_init(&ipsec->rx_ipv6->ft.mutex); ipsec->tx->ns = ns; + if (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_ROCE) + ipsec->roce = mlx5_ipsec_fs_roce_init(mdev); + return 0; err_counters: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index 8e3614218fc4..436d939b95af 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -42,6 +42,10 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) MLX5_CAP_FLOWTABLE_NIC_RX(mdev, decap)) caps |= MLX5_IPSEC_CAP_PACKET_OFFLOAD; + if (mlx5_get_roce_state(mdev) && + (MLX5_CAP_GEN_2(mdev, flow_table_type_2_type) & MLX5_FT_NIC_RX_2_NIC_RX_RDMA)) + caps |= MLX5_IPSEC_CAP_ROCE; + if (!caps) return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index eac9fd35129d..cb28cdb59c17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -111,8 +111,10 @@ #define ETHTOOL_PRIO_NUM_LEVELS 1 #define ETHTOOL_NUM_PRIOS 11 #define ETHTOOL_MIN_LEVEL (KERNEL_MIN_LEVEL + ETHTOOL_NUM_PRIOS) -/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy */ -#define KERNEL_NIC_PRIO_NUM_LEVELS 8 +/* Promiscuous, Vlan, mac, ttc, inner ttc, {UDP/ANY/aRFS/accel/{esp, esp_err}}, IPsec policy, + * IPsec RoCE policy + */ +#define KERNEL_NIC_PRIO_NUM_LEVELS 9 #define KERNEL_NIC_NUM_PRIOS 1 /* One more level for tc */ #define KERNEL_MIN_LEVEL (KERNEL_NIC_PRIO_NUM_LEVELS + 1) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c new file mode 100644 index 000000000000..cfbbf90f5937 --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c @@ -0,0 +1,249 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#include "fs_core.h" +#include "lib/ipsec_fs_roce.h" +#include "mlx5_core.h" + +struct mlx5_ipsec_miss { + struct mlx5_flow_group *group; + struct mlx5_flow_handle *rule; +}; + +struct mlx5_ipsec_rx_roce { + struct mlx5_flow_group *g; + struct mlx5_flow_table *ft; + struct mlx5_flow_handle *rule; + struct mlx5_ipsec_miss roce_miss; + + struct mlx5_flow_table *ft_rdma; + struct mlx5_flow_namespace *ns_rdma; +}; + +struct mlx5_ipsec_fs { + struct mlx5_ipsec_rx_roce ipv4_rx; + struct mlx5_ipsec_rx_roce ipv6_rx; +}; + +static void ipsec_fs_roce_setup_udp_dport(struct mlx5_flow_spec *spec, + u16 dport) +{ + spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS; + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.ip_protocol); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.ip_protocol, IPPROTO_UDP); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, outer_headers.udp_dport); + MLX5_SET(fte_match_param, spec->match_value, outer_headers.udp_dport, dport); +} + +static int +ipsec_fs_roce_rx_rule_setup(struct mlx5_core_dev *mdev, + struct mlx5_flow_destination *default_dst, + struct mlx5_ipsec_rx_roce *roce) +{ + struct mlx5_flow_destination dst = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + ipsec_fs_roce_setup_udp_dport(spec, ROCE_V2_UDP_DPORT); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dst.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; + dst.ft = roce->ft_rdma; + rule = mlx5_add_flow_rules(roce->ft, spec, &flow_act, &dst, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, "Fail to add RX RoCE IPsec rule err=%d\n", + err); + goto fail_add_rule; + } + + roce->rule = rule; + + memset(spec, 0, sizeof(*spec)); + rule = mlx5_add_flow_rules(roce->ft, spec, &flow_act, default_dst, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, "Fail to add RX RoCE IPsec miss rule err=%d\n", + err); + goto fail_add_default_rule; + } + + roce->roce_miss.rule = rule; + + kvfree(spec); + return 0; + +fail_add_default_rule: + mlx5_del_flow_rules(roce->rule); +fail_add_rule: + kvfree(spec); + return err; +} + +struct mlx5_flow_table *mlx5_ipsec_fs_roce_ft_get(struct mlx5_ipsec_fs *ipsec_roce, u32 family) +{ + struct mlx5_ipsec_rx_roce *rx_roce; + + if (!ipsec_roce) + return NULL; + + rx_roce = (family == AF_INET) ? &ipsec_roce->ipv4_rx : + &ipsec_roce->ipv6_rx; + + return rx_roce->ft; +} + +void mlx5_ipsec_fs_roce_rx_destroy(struct mlx5_ipsec_fs *ipsec_roce, u32 family) +{ + struct mlx5_ipsec_rx_roce *rx_roce; + + if (!ipsec_roce) + return; + + rx_roce = (family == AF_INET) ? &ipsec_roce->ipv4_rx : + &ipsec_roce->ipv6_rx; + + mlx5_del_flow_rules(rx_roce->roce_miss.rule); + mlx5_del_flow_rules(rx_roce->rule); + mlx5_destroy_flow_table(rx_roce->ft_rdma); + mlx5_destroy_flow_group(rx_roce->roce_miss.group); + mlx5_destroy_flow_group(rx_roce->g); + mlx5_destroy_flow_table(rx_roce->ft); +} + +#define MLX5_RX_ROCE_GROUP_SIZE BIT(0) + +int mlx5_ipsec_fs_roce_rx_create(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_fs *ipsec_roce, + struct mlx5_flow_namespace *ns, + struct mlx5_flow_destination *default_dst, + u32 family, u32 level, u32 prio) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_ipsec_rx_roce *roce; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + void *outer_headers_c; + int ix = 0; + u32 *in; + int err; + u8 *mc; + + if (!ipsec_roce) + return 0; + + roce = (family == AF_INET) ? &ipsec_roce->ipv4_rx : + &ipsec_roce->ipv6_rx; + + ft_attr.max_fte = 2; + ft_attr.level = level; + ft_attr.prio = prio; + ft = mlx5_create_flow_table(ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + mlx5_core_err(mdev, "Fail to create RoCE IPsec rx ft at nic err=%d\n", err); + return err; + } + + roce->ft = ft; + + in = kvzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in), GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto fail_nomem; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); + outer_headers_c = MLX5_ADDR_OF(fte_match_param, mc, outer_headers); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol); + MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, outer_headers_c, udp_dport); + + MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_RX_ROCE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + g = mlx5_create_flow_group(ft, in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + mlx5_core_err(mdev, "Fail to create RoCE IPsec rx group at nic err=%d\n", err); + goto fail_group; + } + roce->g = g; + + memset(in, 0, MLX5_ST_SZ_BYTES(create_flow_group_in)); + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_RX_ROCE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + g = mlx5_create_flow_group(ft, in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + mlx5_core_err(mdev, "Fail to create RoCE IPsec rx miss group at nic err=%d\n", err); + goto fail_mgroup; + } + roce->roce_miss.group = g; + + memset(&ft_attr, 0, sizeof(ft_attr)); + if (family == AF_INET) + ft_attr.level = 1; + ft = mlx5_create_flow_table(roce->ns_rdma, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + mlx5_core_err(mdev, "Fail to create RoCE IPsec rx ft at rdma err=%d\n", err); + goto fail_rdma_table; + } + + roce->ft_rdma = ft; + + err = ipsec_fs_roce_rx_rule_setup(mdev, default_dst, roce); + if (err) { + mlx5_core_err(mdev, "Fail to create RoCE IPsec rx rules err=%d\n", err); + goto fail_setup_rule; + } + + kvfree(in); + return 0; + +fail_setup_rule: + mlx5_destroy_flow_table(roce->ft_rdma); +fail_rdma_table: + mlx5_destroy_flow_group(roce->roce_miss.group); +fail_mgroup: + mlx5_destroy_flow_group(roce->g); +fail_group: + kvfree(in); +fail_nomem: + mlx5_destroy_flow_table(roce->ft); + return err; +} + +void mlx5_ipsec_fs_roce_cleanup(struct mlx5_ipsec_fs *ipsec_roce) +{ + kfree(ipsec_roce); +} + +struct mlx5_ipsec_fs *mlx5_ipsec_fs_roce_init(struct mlx5_core_dev *mdev) +{ + struct mlx5_ipsec_fs *roce_ipsec; + struct mlx5_flow_namespace *ns; + + ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_RDMA_RX_IPSEC); + if (!ns) { + mlx5_core_err(mdev, "Failed to get RoCE rx ns\n"); + return NULL; + } + + roce_ipsec = kzalloc(sizeof(*roce_ipsec), GFP_KERNEL); + if (!roce_ipsec) + return NULL; + + roce_ipsec->ipv4_rx.ns_rdma = ns; + roce_ipsec->ipv6_rx.ns_rdma = ns; + + return roce_ipsec; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h new file mode 100644 index 000000000000..9231b350bb0b --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */ + +#ifndef __MLX5_LIB_IPSEC_H__ +#define __MLX5_LIB_IPSEC_H__ + +struct mlx5_ipsec_fs; + +struct mlx5_flow_table * +mlx5_ipsec_fs_roce_ft_get(struct mlx5_ipsec_fs *ipsec_roce, u32 family); +void mlx5_ipsec_fs_roce_rx_destroy(struct mlx5_ipsec_fs *ipsec_roce, + u32 family); +int mlx5_ipsec_fs_roce_rx_create(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_fs *ipsec_roce, + struct mlx5_flow_namespace *ns, + struct mlx5_flow_destination *default_dst, + u32 family, u32 level, u32 prio); +void mlx5_ipsec_fs_roce_cleanup(struct mlx5_ipsec_fs *ipsec_roce); +struct mlx5_ipsec_fs *mlx5_ipsec_fs_roce_init(struct mlx5_core_dev *mdev); + +#endif /* __MLX5_LIB_IPSEC_H__ */ From 22551e77e5507a06114c0af2b92bbf1a66ec33c5 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Wed, 4 Jan 2023 20:17:56 -0800 Subject: [PATCH 08/12] net/mlx5: Configure IPsec steering for egress RoCEv2 traffic Add steering table/rule in RDMA_TX domain, to forward all traffic to IPsec crypto table in NIC domain. Signed-off-by: Mark Zhang Signed-off-by: Patrisious Haddad Reviewed-by: Raed Salem Reviewed-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Leon Romanovsky --- .../mellanox/mlx5/core/en_accel/ipsec_fs.c | 14 ++- .../mlx5/core/en_accel/ipsec_offload.c | 3 +- .../mellanox/mlx5/core/lib/ipsec_fs_roce.c | 119 ++++++++++++++++++ .../mellanox/mlx5/core/lib/ipsec_fs_roce.h | 4 + 4 files changed, 137 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index f0a6db19272e..8cc3d3794ec7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -334,7 +334,8 @@ static void rx_ft_put(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, } /* IPsec TX flow steering */ -static int tx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_tx *tx) +static int tx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_tx *tx, + struct mlx5_ipsec_fs *roce) { struct mlx5_flow_destination dest = {}; struct mlx5_flow_table *ft; @@ -357,8 +358,15 @@ static int tx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec_tx *tx) err = ipsec_miss_create(mdev, tx->ft.pol, &tx->pol, &dest); if (err) goto err_pol_miss; + + err = mlx5_ipsec_fs_roce_tx_create(mdev, roce, tx->ft.pol); + if (err) + goto err_roce; return 0; +err_roce: + mlx5_del_flow_rules(tx->pol.rule); + mlx5_destroy_flow_group(tx->pol.group); err_pol_miss: mlx5_destroy_flow_table(tx->ft.pol); err_pol_ft: @@ -376,9 +384,10 @@ static struct mlx5e_ipsec_tx *tx_ft_get(struct mlx5_core_dev *mdev, if (tx->ft.refcnt) goto skip; - err = tx_create(mdev, tx); + err = tx_create(mdev, tx, ipsec->roce); if (err) goto out; + skip: tx->ft.refcnt++; out: @@ -397,6 +406,7 @@ static void tx_ft_put(struct mlx5e_ipsec *ipsec) if (tx->ft.refcnt) goto out; + mlx5_ipsec_fs_roce_tx_destroy(ipsec->roce); mlx5_del_flow_rules(tx->pol.rule); mlx5_destroy_flow_group(tx->pol.group); mlx5_destroy_flow_table(tx->ft.pol); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c index 436d939b95af..b5da6f71ee88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_offload.c @@ -43,7 +43,8 @@ u32 mlx5_ipsec_device_caps(struct mlx5_core_dev *mdev) caps |= MLX5_IPSEC_CAP_PACKET_OFFLOAD; if (mlx5_get_roce_state(mdev) && - (MLX5_CAP_GEN_2(mdev, flow_table_type_2_type) & MLX5_FT_NIC_RX_2_NIC_RX_RDMA)) + MLX5_CAP_GEN_2(mdev, flow_table_type_2_type) & MLX5_FT_NIC_RX_2_NIC_RX_RDMA && + MLX5_CAP_GEN_2(mdev, flow_table_type_2_type) & MLX5_FT_NIC_TX_RDMA_2_NIC_TX) caps |= MLX5_IPSEC_CAP_ROCE; if (!caps) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c index cfbbf90f5937..2c53589b765d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.c @@ -20,9 +20,17 @@ struct mlx5_ipsec_rx_roce { struct mlx5_flow_namespace *ns_rdma; }; +struct mlx5_ipsec_tx_roce { + struct mlx5_flow_group *g; + struct mlx5_flow_table *ft; + struct mlx5_flow_handle *rule; + struct mlx5_flow_namespace *ns; +}; + struct mlx5_ipsec_fs { struct mlx5_ipsec_rx_roce ipv4_rx; struct mlx5_ipsec_rx_roce ipv6_rx; + struct mlx5_ipsec_tx_roce tx; }; static void ipsec_fs_roce_setup_udp_dport(struct mlx5_flow_spec *spec, @@ -86,6 +94,105 @@ ipsec_fs_roce_rx_rule_setup(struct mlx5_core_dev *mdev, return err; } +static int ipsec_fs_roce_tx_rule_setup(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_tx_roce *roce, + struct mlx5_flow_table *pol_ft) +{ + struct mlx5_flow_destination dst = {}; + MLX5_DECLARE_FLOW_ACT(flow_act); + struct mlx5_flow_handle *rule; + int err = 0; + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dst.type = MLX5_FLOW_DESTINATION_TYPE_TABLE_TYPE; + dst.ft = pol_ft; + rule = mlx5_add_flow_rules(roce->ft, NULL, &flow_act, &dst, + 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, "Fail to add TX RoCE IPsec rule err=%d\n", + err); + goto out; + } + roce->rule = rule; + +out: + return err; +} + +void mlx5_ipsec_fs_roce_tx_destroy(struct mlx5_ipsec_fs *ipsec_roce) +{ + struct mlx5_ipsec_tx_roce *tx_roce; + + if (!ipsec_roce) + return; + + tx_roce = &ipsec_roce->tx; + + mlx5_del_flow_rules(tx_roce->rule); + mlx5_destroy_flow_group(tx_roce->g); + mlx5_destroy_flow_table(tx_roce->ft); +} + +#define MLX5_TX_ROCE_GROUP_SIZE BIT(0) + +int mlx5_ipsec_fs_roce_tx_create(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_fs *ipsec_roce, + struct mlx5_flow_table *pol_ft) +{ + struct mlx5_flow_table_attr ft_attr = {}; + struct mlx5_ipsec_tx_roce *roce; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *g; + int ix = 0; + int err; + u32 *in; + + if (!ipsec_roce) + return 0; + + roce = &ipsec_roce->tx; + + in = kvzalloc(MLX5_ST_SZ_BYTES(create_flow_group_in), GFP_KERNEL); + if (!in) + return -ENOMEM; + + ft_attr.max_fte = 1; + ft = mlx5_create_flow_table(roce->ns, &ft_attr); + if (IS_ERR(ft)) { + err = PTR_ERR(ft); + mlx5_core_err(mdev, "Fail to create RoCE IPsec tx ft err=%d\n", err); + return err; + } + + roce->ft = ft; + + MLX5_SET_CFG(in, start_flow_index, ix); + ix += MLX5_TX_ROCE_GROUP_SIZE; + MLX5_SET_CFG(in, end_flow_index, ix - 1); + g = mlx5_create_flow_group(ft, in); + if (IS_ERR(g)) { + err = PTR_ERR(g); + mlx5_core_err(mdev, "Fail to create RoCE IPsec tx group err=%d\n", err); + goto fail; + } + roce->g = g; + + err = ipsec_fs_roce_tx_rule_setup(mdev, roce, pol_ft); + if (err) { + mlx5_core_err(mdev, "Fail to create RoCE IPsec tx rules err=%d\n", err); + goto rule_fail; + } + + return 0; + +rule_fail: + mlx5_destroy_flow_group(roce->g); +fail: + mlx5_destroy_flow_table(ft); + return err; +} + struct mlx5_flow_table *mlx5_ipsec_fs_roce_ft_get(struct mlx5_ipsec_fs *ipsec_roce, u32 family) { struct mlx5_ipsec_rx_roce *rx_roce; @@ -245,5 +352,17 @@ struct mlx5_ipsec_fs *mlx5_ipsec_fs_roce_init(struct mlx5_core_dev *mdev) roce_ipsec->ipv4_rx.ns_rdma = ns; roce_ipsec->ipv6_rx.ns_rdma = ns; + ns = mlx5_get_flow_namespace(mdev, MLX5_FLOW_NAMESPACE_RDMA_TX_IPSEC); + if (!ns) { + mlx5_core_err(mdev, "Failed to get RoCE tx ns\n"); + goto err_tx; + } + + roce_ipsec->tx.ns = ns; + return roce_ipsec; + +err_tx: + kfree(roce_ipsec); + return NULL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h index 9231b350bb0b..9712d705fe48 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/ipsec_fs_roce.h @@ -15,6 +15,10 @@ int mlx5_ipsec_fs_roce_rx_create(struct mlx5_core_dev *mdev, struct mlx5_flow_namespace *ns, struct mlx5_flow_destination *default_dst, u32 family, u32 level, u32 prio); +void mlx5_ipsec_fs_roce_tx_destroy(struct mlx5_ipsec_fs *ipsec_roce); +int mlx5_ipsec_fs_roce_tx_create(struct mlx5_core_dev *mdev, + struct mlx5_ipsec_fs *ipsec_roce, + struct mlx5_flow_table *pol_ft); void mlx5_ipsec_fs_roce_cleanup(struct mlx5_ipsec_fs *ipsec_roce); struct mlx5_ipsec_fs *mlx5_ipsec_fs_roce_init(struct mlx5_core_dev *mdev); From 4b7296aa6c6618d6a6840fbe857e4990f626bd90 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Tue, 17 Jan 2023 15:14:49 +0200 Subject: [PATCH 09/12] net/mlx5: Expose bits for querying special mkeys Add needed HW bits to query the values of all special mkeys. Link: https://lore.kernel.org/r/080ebb563a9717c15b1ea75d669aede676df386b.1673960981.git.leon@kernel.org Signed-off-by: Or Har-Toov Reviewed-by: Michael Guralnik Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/linux/mlx5/mlx5_ifc.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index c3d3a2eef7d4..67cfac8fbe46 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1485,7 +1485,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 relaxed_ordering_write[0x1]; u8 relaxed_ordering_read[0x1]; u8 log_max_mkey[0x6]; - u8 reserved_at_f0[0x8]; + u8 reserved_at_f0[0x6]; + u8 terminate_scatter_list_mkey[0x1]; + u8 repeated_mkey[0x1]; u8 dump_fill_mkey[0x1]; u8 reserved_at_f9[0x2]; u8 fast_teardown[0x1]; @@ -5210,7 +5212,11 @@ struct mlx5_ifc_query_special_contexts_out_bits { u8 null_mkey[0x20]; - u8 reserved_at_a0[0x60]; + u8 terminate_scatter_list_mkey[0x20]; + + u8 repeated_mkey[0x20]; + + u8 reserved_at_a0[0x20]; }; struct mlx5_ifc_query_special_contexts_in_bits { From a419bfb7632095410adc3aecb1e863568f049add Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Tue, 17 Jan 2023 15:14:50 +0200 Subject: [PATCH 10/12] net/mlx5: Change define name for 0x100 lkey value Change define of 0x100 lkey value from MLX5_INVALID_LKEY to be MLX5_TERMINATE_SCATTER_LIST_LKEY as 0x100 is the value of terminate_scatter_list_mkey. Link: https://lore.kernel.org/r/3a116dc3fbae4cb6b76a63d27d418830b06ade0c.1673960981.git.leon@kernel.org Signed-off-by: Or Har-Toov Reviewed-by: Michael Guralnik Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 10 +++++----- drivers/infiniband/hw/mlx5/srq.c | 2 +- drivers/infiniband/hw/mlx5/wr.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++- include/linux/mlx5/qp.h | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e6e021af6aa9..b4ebeadce67c 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -986,7 +986,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, { int ret = 0, npages = 0; u64 io_virt; - u32 key; + __be32 key; u32 byte_count; size_t bcnt; int inline_segment; @@ -1000,7 +1000,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, struct mlx5_wqe_data_seg *dseg = wqe; io_virt = be64_to_cpu(dseg->addr); - key = be32_to_cpu(dseg->lkey); + key = dseg->lkey; byte_count = be32_to_cpu(dseg->byte_count); inline_segment = !!(byte_count & MLX5_INLINE_SEG); bcnt = byte_count & ~MLX5_INLINE_SEG; @@ -1014,8 +1014,8 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, } /* receive WQE end of sg list. */ - if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY && - io_virt == 0) + if (receive_queue && bcnt == 0 && + key == MLX5_TERMINATE_SCATTER_LIST_LKEY && io_virt == 0) break; if (!inline_segment && total_wqe_bytes) { @@ -1034,7 +1034,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, continue; } - ret = pagefault_single_data_segment(dev, NULL, key, + ret = pagefault_single_data_segment(dev, NULL, be32_to_cpu(key), io_virt, bcnt, &pfault->bytes_committed, bytes_mapped); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 09b365a98bbf..0ac8d2c57365 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -447,7 +447,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, if (i < srq->msrq.max_avail_gather) { scat[i].byte_count = 0; - scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].lkey = MLX5_TERMINATE_SCATTER_LIST_LKEY; scat[i].addr = 0; } } diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c index 855f3f4fefad..bc44551493e2 100644 --- a/drivers/infiniband/hw/mlx5/wr.c +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -1252,7 +1252,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (i < qp->rq.max_gs) { scat[i].byte_count = 0; - scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + scat[i].lkey = MLX5_TERMINATE_SCATTER_LIST_LKEY; scat[i].addr = 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 85b51039d2a6..a941a4880cc5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -829,7 +829,8 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, /* check if num_frags is not a pow of two */ if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) { wqe->data[f].byte_count = 0; - wqe->data[f].lkey = cpu_to_be32(MLX5_INVALID_LKEY); + wqe->data[f].lkey = + MLX5_TERMINATE_SCATTER_LIST_LKEY; wqe->data[f].addr = 0; } } diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 4657d5c54abe..df55fbb65717 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -36,7 +36,7 @@ #include #include -#define MLX5_INVALID_LKEY 0x100 +#define MLX5_TERMINATE_SCATTER_LIST_LKEY cpu_to_be32(0x100) /* UMR (3 WQE_BB's) + SIG (3 WQE_BB's) + PSV (mem) + PSV (wire) */ #define MLX5_SIG_WQE_SIZE (MLX5_SEND_WQE_BB * 8) #define MLX5_DIF_SIZE 8 From 1b1e4868836a4b5b375be75fd4c9583d29500517 Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Tue, 17 Jan 2023 15:14:51 +0200 Subject: [PATCH 11/12] net/mlx5e: Use query_special_contexts for mkeys Use query_sepcial_contexts in order to get the correct value of terminate_scatter_list_mkey, as FW will change it for certain configurations. Link: https://lore.kernel.org/r/fff70d94258233effb0e34f3d62cb08a692f5af5.1673960981.git.leon@kernel.org Signed-off-by: Or Har-Toov Reviewed-by: Michael Guralnik Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index a941a4880cc5..dd9f44dd79cd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -665,6 +665,26 @@ static void mlx5e_rq_free_shampo(struct mlx5e_rq *rq) mlx5e_rq_shampo_hd_free(rq); } +static __be32 mlx5e_get_terminate_scatter_list_mkey(struct mlx5_core_dev *dev) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + int res; + + if (!MLX5_CAP_GEN(dev, terminate_scatter_list_mkey)) + return MLX5_TERMINATE_SCATTER_LIST_LKEY; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + res = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); + if (res) + return MLX5_TERMINATE_SCATTER_LIST_LKEY; + + res = MLX5_GET(query_special_contexts_out, out, + terminate_scatter_list_mkey); + return cpu_to_be32(res); +} + static int mlx5e_alloc_rq(struct mlx5e_params *params, struct mlx5e_xsk_param *xsk, struct mlx5e_rq_param *rqp, @@ -829,8 +849,7 @@ static int mlx5e_alloc_rq(struct mlx5e_params *params, /* check if num_frags is not a pow of two */ if (rq->wqe.info.num_frags < (1 << rq->wqe.info.log_num_frags)) { wqe->data[f].byte_count = 0; - wqe->data[f].lkey = - MLX5_TERMINATE_SCATTER_LIST_LKEY; + wqe->data[f].lkey = mlx5e_get_terminate_scatter_list_mkey(mdev); wqe->data[f].addr = 0; } } From 594cac11ab6a1be8022a3c96d181dde7cfb0b8cf Mon Sep 17 00:00:00 2001 From: Or Har-Toov Date: Tue, 17 Jan 2023 15:14:52 +0200 Subject: [PATCH 12/12] RDMA/mlx5: Use query_special_contexts for mkeys Use query_sepcial_contexts to get the correct value of mkeys such as null_mkey, terminate_scatter_list_mkey and dump_fill_mkey, as FW will change them in certain configurations. Link: https://lore.kernel.org/r/000236f0a9487d48809f87bcc3620a3964b2d3d3.1673960981.git.leon@kernel.org Signed-off-by: Or Har-Toov Reviewed-by: Michael Guralnik Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 45 ++++++++++++++++------------ drivers/infiniband/hw/mlx5/cmd.h | 3 +- drivers/infiniband/hw/mlx5/main.c | 10 +++---- drivers/infiniband/hw/mlx5/mlx5_ib.h | 9 +++++- drivers/infiniband/hw/mlx5/odp.c | 21 ++++--------- drivers/infiniband/hw/mlx5/srq.c | 2 +- drivers/infiniband/hw/mlx5/wr.c | 2 +- 7 files changed, 48 insertions(+), 44 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index ff3742b0460a..1d0c8d5e745b 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -5,34 +5,41 @@ #include "cmd.h" -int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey) +int mlx5r_cmd_query_special_mkeys(struct mlx5_ib_dev *dev) { u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + bool is_terminate, is_dump, is_null; int err; + is_terminate = MLX5_CAP_GEN(dev->mdev, terminate_scatter_list_mkey); + is_dump = MLX5_CAP_GEN(dev->mdev, dump_fill_mkey); + is_null = MLX5_CAP_GEN(dev->mdev, null_mkey); + + dev->mkeys.terminate_scatter_list_mkey = MLX5_TERMINATE_SCATTER_LIST_LKEY; + if (!is_terminate && !is_dump && !is_null) + return 0; + MLX5_SET(query_special_contexts_in, in, opcode, MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); - if (!err) - *mkey = MLX5_GET(query_special_contexts_out, out, - dump_fill_mkey); - return err; -} + err = mlx5_cmd_exec_inout(dev->mdev, query_special_contexts, in, out); + if (err) + return err; -int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) -{ - u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; - u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; - int err; + if (is_dump) + dev->mkeys.dump_fill_mkey = MLX5_GET(query_special_contexts_out, + out, dump_fill_mkey); - MLX5_SET(query_special_contexts_in, in, opcode, - MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); - err = mlx5_cmd_exec_inout(dev, query_special_contexts, in, out); - if (!err) - *null_mkey = MLX5_GET(query_special_contexts_out, out, - null_mkey); - return err; + if (is_null) + dev->mkeys.null_mkey = cpu_to_be32( + MLX5_GET(query_special_contexts_out, out, null_mkey)); + + if (is_terminate) + dev->mkeys.terminate_scatter_list_mkey = + cpu_to_be32(MLX5_GET(query_special_contexts_out, out, + terminate_scatter_list_mkey)); + + return 0; } int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index ee46638db5de..93a971a40d11 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -37,8 +37,7 @@ #include #include -int mlx5_cmd_dump_fill_mkey(struct mlx5_core_dev *dev, u32 *mkey); -int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); +int mlx5r_cmd_query_special_mkeys(struct mlx5_ib_dev *dev); int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point, void *out); int mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index dc32e4518a28..e23db1924789 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1756,13 +1756,9 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_ucontext *context = to_mucontext(uctx); struct mlx5_bfreg_info *bfregi = &context->bfregi; - int err; if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { - err = mlx5_cmd_dump_fill_mkey(dev->mdev, - &resp->dump_fill_mkey); - if (err) - return err; + resp->dump_fill_mkey = dev->mkeys.dump_fill_mkey; resp->comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY; } @@ -3666,6 +3662,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->port[i].roce.last_port_state = IB_PORT_DOWN; } + err = mlx5r_cmd_query_special_mkeys(dev); + if (err) + return err; + err = mlx5_ib_init_multiport_master(dev); if (err) return err; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 7394e7f36ba7..ff65fec5918d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1054,6 +1054,13 @@ struct mlx5_port_caps { u8 ext_port_cap; }; + +struct mlx5_special_mkeys { + u32 dump_fill_mkey; + __be32 null_mkey; + __be32 terminate_scatter_list_mkey; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -1084,7 +1091,6 @@ struct mlx5_ib_dev { struct xarray odp_mkeys; - u32 null_mkey; struct mlx5_ib_flow_db *flow_db; /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; @@ -1113,6 +1119,7 @@ struct mlx5_ib_dev { struct mlx5_port_caps port_caps[MLX5_MAX_PORTS]; u16 pkey_table_len; u8 lag_ports; + struct mlx5_special_mkeys mkeys; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index b4ebeadce67c..4998eaeadcbb 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -104,7 +104,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, if (flags & MLX5_IB_UPD_XLT_ZAP) { for (; pklm != end; pklm++, idx++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey); + pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; pklm->va = 0; } return; @@ -137,7 +137,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey); + pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; pklm->va = 0; } } @@ -1015,7 +1015,8 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, /* receive WQE end of sg list. */ if (receive_queue && bcnt == 0 && - key == MLX5_TERMINATE_SCATTER_LIST_LKEY && io_virt == 0) + key == dev->mkeys.terminate_scatter_list_mkey && + io_virt == 0) break; if (!inline_segment && total_wqe_bytes) { @@ -1615,25 +1616,15 @@ static const struct ib_device_ops mlx5_ib_dev_odp_ops = { int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { - int ret = 0; - internal_fill_odp_caps(dev); if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT)) - return ret; + return 0; ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops); - if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { - ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); - if (ret) { - mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); - return ret; - } - } - mutex_init(&dev->odp_eq_mutex); - return ret; + return 0; } void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 0ac8d2c57365..a056ea835da5 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -447,7 +447,7 @@ int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, if (i < srq->msrq.max_avail_gather) { scat[i].byte_count = 0; - scat[i].lkey = MLX5_TERMINATE_SCATTER_LIST_LKEY; + scat[i].lkey = dev->mkeys.terminate_scatter_list_mkey; scat[i].addr = 0; } } diff --git a/drivers/infiniband/hw/mlx5/wr.c b/drivers/infiniband/hw/mlx5/wr.c index bc44551493e2..df1d1b0a3ef7 100644 --- a/drivers/infiniband/hw/mlx5/wr.c +++ b/drivers/infiniband/hw/mlx5/wr.c @@ -1252,7 +1252,7 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, if (i < qp->rq.max_gs) { scat[i].byte_count = 0; - scat[i].lkey = MLX5_TERMINATE_SCATTER_LIST_LKEY; + scat[i].lkey = dev->mkeys.terminate_scatter_list_mkey; scat[i].addr = 0; }