linux-stable/drivers/net/ethernet/mscc/ocelot.c

3026 lines
80 KiB
C
Raw Normal View History

// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/*
* Microsemi Ocelot Switch driver
*
* Copyright (c) 2017 Microsemi Corporation
*/
#include <linux/dsa/ocelot.h>
#include <linux/if_bridge.h>
#include <linux/iopoll.h>
#include <linux/phy/phy.h>
#include <soc/mscc/ocelot_hsio.h>
#include <soc/mscc/ocelot_vcap.h>
#include "ocelot.h"
#include "ocelot_vcap.h"
#define TABLE_UPDATE_SLEEP_US 10
#define TABLE_UPDATE_TIMEOUT_US 100000
#define MEM_INIT_SLEEP_US 1000
#define MEM_INIT_TIMEOUT_US 100000
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
#define OCELOT_RSV_VLAN_RANGE_START 4000
struct ocelot_mact_entry {
u8 mac[ETH_ALEN];
u16 vid;
enum macaccess_entry_type type;
};
/* Caller must hold &ocelot->mact_lock */
static inline u32 ocelot_mact_read_macaccess(struct ocelot *ocelot)
{
return ocelot_read(ocelot, ANA_TABLES_MACACCESS);
}
/* Caller must hold &ocelot->mact_lock */
static inline int ocelot_mact_wait_for_completion(struct ocelot *ocelot)
{
u32 val;
return readx_poll_timeout(ocelot_mact_read_macaccess,
ocelot, val,
(val & ANA_TABLES_MACACCESS_MAC_TABLE_CMD_M) ==
MACACCESS_CMD_IDLE,
TABLE_UPDATE_SLEEP_US, TABLE_UPDATE_TIMEOUT_US);
}
/* Caller must hold &ocelot->mact_lock */
static void ocelot_mact_select(struct ocelot *ocelot,
const unsigned char mac[ETH_ALEN],
unsigned int vid)
{
u32 macl = 0, mach = 0;
/* Set the MAC address to handle and the vlan associated in a format
* understood by the hardware.
*/
mach |= vid << 16;
mach |= mac[0] << 8;
mach |= mac[1] << 0;
macl |= mac[2] << 24;
macl |= mac[3] << 16;
macl |= mac[4] << 8;
macl |= mac[5] << 0;
ocelot_write(ocelot, macl, ANA_TABLES_MACLDATA);
ocelot_write(ocelot, mach, ANA_TABLES_MACHDATA);
}
static int __ocelot_mact_learn(struct ocelot *ocelot, int port,
const unsigned char mac[ETH_ALEN],
unsigned int vid, enum macaccess_entry_type type)
{
u32 cmd = ANA_TABLES_MACACCESS_VALID |
ANA_TABLES_MACACCESS_DEST_IDX(port) |
ANA_TABLES_MACACCESS_ENTRYTYPE(type) |
ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN);
unsigned int mc_ports;
int err;
/* Set MAC_CPU_COPY if the CPU port is used by a multicast entry */
if (type == ENTRYTYPE_MACv4)
mc_ports = (mac[1] << 8) | mac[2];
else if (type == ENTRYTYPE_MACv6)
mc_ports = (mac[0] << 8) | mac[1];
else
mc_ports = 0;
if (mc_ports & BIT(ocelot->num_phys_ports))
cmd |= ANA_TABLES_MACACCESS_MAC_CPU_COPY;
ocelot_mact_select(ocelot, mac, vid);
/* Issue a write command */
ocelot_write(ocelot, cmd, ANA_TABLES_MACACCESS);
err = ocelot_mact_wait_for_completion(ocelot);
return err;
}
int ocelot_mact_learn(struct ocelot *ocelot, int port,
const unsigned char mac[ETH_ALEN],
unsigned int vid, enum macaccess_entry_type type)
{
int ret;
mutex_lock(&ocelot->mact_lock);
ret = __ocelot_mact_learn(ocelot, port, mac, vid, type);
mutex_unlock(&ocelot->mact_lock);
return ret;
}
EXPORT_SYMBOL(ocelot_mact_learn);
int ocelot_mact_forget(struct ocelot *ocelot,
const unsigned char mac[ETH_ALEN], unsigned int vid)
{
int err;
mutex_lock(&ocelot->mact_lock);
ocelot_mact_select(ocelot, mac, vid);
/* Issue a forget command */
ocelot_write(ocelot,
ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_FORGET),
ANA_TABLES_MACACCESS);
err = ocelot_mact_wait_for_completion(ocelot);
mutex_unlock(&ocelot->mact_lock);
return err;
}
EXPORT_SYMBOL(ocelot_mact_forget);
int ocelot_mact_lookup(struct ocelot *ocelot, int *dst_idx,
const unsigned char mac[ETH_ALEN],
unsigned int vid, enum macaccess_entry_type *type)
{
int val;
mutex_lock(&ocelot->mact_lock);
ocelot_mact_select(ocelot, mac, vid);
/* Issue a read command with MACACCESS_VALID=1. */
ocelot_write(ocelot, ANA_TABLES_MACACCESS_VALID |
ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_READ),
ANA_TABLES_MACACCESS);
if (ocelot_mact_wait_for_completion(ocelot)) {
mutex_unlock(&ocelot->mact_lock);
return -ETIMEDOUT;
}
/* Read back the entry flags */
val = ocelot_read(ocelot, ANA_TABLES_MACACCESS);
mutex_unlock(&ocelot->mact_lock);
if (!(val & ANA_TABLES_MACACCESS_VALID))
return -ENOENT;
*dst_idx = ANA_TABLES_MACACCESS_DEST_IDX_X(val);
*type = ANA_TABLES_MACACCESS_ENTRYTYPE_X(val);
return 0;
}
EXPORT_SYMBOL(ocelot_mact_lookup);
int ocelot_mact_learn_streamdata(struct ocelot *ocelot, int dst_idx,
const unsigned char mac[ETH_ALEN],
unsigned int vid,
enum macaccess_entry_type type,
int sfid, int ssid)
{
int ret;
mutex_lock(&ocelot->mact_lock);
ocelot_write(ocelot,
(sfid < 0 ? 0 : ANA_TABLES_STREAMDATA_SFID_VALID) |
ANA_TABLES_STREAMDATA_SFID(sfid) |
(ssid < 0 ? 0 : ANA_TABLES_STREAMDATA_SSID_VALID) |
ANA_TABLES_STREAMDATA_SSID(ssid),
ANA_TABLES_STREAMDATA);
ret = __ocelot_mact_learn(ocelot, dst_idx, mac, vid, type);
mutex_unlock(&ocelot->mact_lock);
return ret;
}
EXPORT_SYMBOL(ocelot_mact_learn_streamdata);
static void ocelot_mact_init(struct ocelot *ocelot)
{
/* Configure the learning mode entries attributes:
* - Do not copy the frame to the CPU extraction queues.
* - Use the vlan and mac_cpoy for dmac lookup.
*/
ocelot_rmw(ocelot, 0,
ANA_AGENCTRL_LEARN_CPU_COPY | ANA_AGENCTRL_IGNORE_DMAC_FLAGS
| ANA_AGENCTRL_LEARN_FWD_KILL
| ANA_AGENCTRL_LEARN_IGNORE_VLAN,
ANA_AGENCTRL);
/* Clear the MAC table. We are not concurrent with anyone, so
* holding &ocelot->mact_lock is pointless.
*/
ocelot_write(ocelot, MACACCESS_CMD_INIT, ANA_TABLES_MACACCESS);
}
void ocelot_pll5_init(struct ocelot *ocelot)
{
/* Configure PLL5. This will need a proper CCF driver
* The values are coming from the VTSS API for Ocelot
*/
regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG4,
HSIO_PLL5G_CFG4_IB_CTRL(0x7600) |
HSIO_PLL5G_CFG4_IB_BIAS_CTRL(0x8));
regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG0,
HSIO_PLL5G_CFG0_CORE_CLK_DIV(0x11) |
HSIO_PLL5G_CFG0_CPU_CLK_DIV(2) |
HSIO_PLL5G_CFG0_ENA_BIAS |
HSIO_PLL5G_CFG0_ENA_VCO_BUF |
HSIO_PLL5G_CFG0_ENA_CP1 |
HSIO_PLL5G_CFG0_SELCPI(2) |
HSIO_PLL5G_CFG0_LOOP_BW_RES(0xe) |
HSIO_PLL5G_CFG0_SELBGV820(4) |
HSIO_PLL5G_CFG0_DIV4 |
HSIO_PLL5G_CFG0_ENA_CLKTREE |
HSIO_PLL5G_CFG0_ENA_LANE);
regmap_write(ocelot->targets[HSIO], HSIO_PLL5G_CFG2,
HSIO_PLL5G_CFG2_EN_RESET_FRQ_DET |
HSIO_PLL5G_CFG2_EN_RESET_OVERRUN |
HSIO_PLL5G_CFG2_GAIN_TEST(0x8) |
HSIO_PLL5G_CFG2_ENA_AMPCTRL |
HSIO_PLL5G_CFG2_PWD_AMPCTRL_N |
HSIO_PLL5G_CFG2_AMPC_SEL(0x10));
}
EXPORT_SYMBOL(ocelot_pll5_init);
static void ocelot_vcap_enable(struct ocelot *ocelot, int port)
{
ocelot_write_gix(ocelot, ANA_PORT_VCAP_S2_CFG_S2_ENA |
ANA_PORT_VCAP_S2_CFG_S2_IP6_CFG(0xa),
ANA_PORT_VCAP_S2_CFG, port);
ocelot_write_gix(ocelot, ANA_PORT_VCAP_CFG_S1_ENA,
ANA_PORT_VCAP_CFG, port);
ocelot_rmw_gix(ocelot, REW_PORT_CFG_ES0_EN,
REW_PORT_CFG_ES0_EN,
REW_PORT_CFG, port);
}
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
static int ocelot_single_vlan_aware_bridge(struct ocelot *ocelot,
struct netlink_ext_ack *extack)
{
struct net_device *bridge = NULL;
int port;
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port || !ocelot_port->bridge ||
!br_vlan_enabled(ocelot_port->bridge))
continue;
if (!bridge) {
bridge = ocelot_port->bridge;
continue;
}
if (bridge == ocelot_port->bridge)
continue;
NL_SET_ERR_MSG_MOD(extack,
"Only one VLAN-aware bridge is supported");
return -EBUSY;
}
return 0;
}
static inline u32 ocelot_vlant_read_vlanaccess(struct ocelot *ocelot)
{
return ocelot_read(ocelot, ANA_TABLES_VLANACCESS);
}
static inline int ocelot_vlant_wait_for_completion(struct ocelot *ocelot)
{
u32 val;
return readx_poll_timeout(ocelot_vlant_read_vlanaccess,
ocelot,
val,
(val & ANA_TABLES_VLANACCESS_VLAN_TBL_CMD_M) ==
ANA_TABLES_VLANACCESS_CMD_IDLE,
TABLE_UPDATE_SLEEP_US, TABLE_UPDATE_TIMEOUT_US);
}
static int ocelot_vlant_set_mask(struct ocelot *ocelot, u16 vid, u32 mask)
{
/* Select the VID to configure */
ocelot_write(ocelot, ANA_TABLES_VLANTIDX_V_INDEX(vid),
ANA_TABLES_VLANTIDX);
/* Set the vlan port members mask and issue a write command */
ocelot_write(ocelot, ANA_TABLES_VLANACCESS_VLAN_PORT_MASK(mask) |
ANA_TABLES_VLANACCESS_CMD_WRITE,
ANA_TABLES_VLANACCESS);
return ocelot_vlant_wait_for_completion(ocelot);
}
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
static int ocelot_port_num_untagged_vlans(struct ocelot *ocelot, int port)
{
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
struct ocelot_bridge_vlan *vlan;
int num_untagged = 0;
list_for_each_entry(vlan, &ocelot->vlans, list) {
if (!(vlan->portmask & BIT(port)))
continue;
net: mscc: ocelot: fix tagged VLAN refusal while under a VLAN-unaware bridge Currently the following set of commands fails: $ ip link add br0 type bridge # vlan_filtering 0 $ ip link set swp0 master br0 $ bridge vlan port vlan-id swp0 1 PVID Egress Untagged $ bridge vlan add dev swp0 vid 10 Error: mscc_ocelot_switch_lib: Port with more than one egress-untagged VLAN cannot have egress-tagged VLANs. Dumping ocelot->vlans, one can see that the 2 egress-untagged VLANs on swp0 are vid 1 (the bridge PVID) and vid 4094, a PVID used privately by the driver for VLAN-unaware bridging. So this is why bridge vid 10 is refused, despite 'bridge vlan' showing a single egress untagged VLAN. As mentioned in the comment added, having this private VLAN does not impose restrictions to the hardware configuration, yet it is a bookkeeping problem. There are 2 possible solutions. One is to make the functions that operate on VLAN-unaware pvids: - ocelot_add_vlan_unaware_pvid() - ocelot_del_vlan_unaware_pvid() - ocelot_port_setup_dsa_8021q_cpu() - ocelot_port_teardown_dsa_8021q_cpu() call something different than ocelot_vlan_member_(add|del)(), the latter being the real problem, because it allocates a struct ocelot_bridge_vlan *vlan which it adds to ocelot->vlans. We don't really *need* the private VLANs in ocelot->vlans, it's just that we have the extra convenience of having the vlan->portmask cached in software (whereas without these structures, we'd have to create a raw ocelot_vlant_rmw_mask() procedure which reads back the current port mask from hardware). The other solution is to filter out the private VLANs from ocelot_port_num_untagged_vlans(), since they aren't what callers care about. We only need to do this to the mentioned function and not to ocelot_port_num_tagged_vlans(), because private VLANs are never egress-tagged. Nothing else seems to be broken in either solution, but the first one requires more rework which will conflict with the net-next change 36a0bf443585 ("net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity"), and I'd like to avoid that. So go with the other one. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220927122042.1100231-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-09-27 12:20:42 +00:00
/* Ignore the VLAN added by ocelot_add_vlan_unaware_pvid(),
* because this is never active in hardware at the same time as
* the bridge VLANs, which only matter in VLAN-aware mode.
*/
if (vlan->vid >= OCELOT_RSV_VLAN_RANGE_START)
continue;
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
if (vlan->untagged & BIT(port))
num_untagged++;
}
return num_untagged;
}
static int ocelot_port_num_tagged_vlans(struct ocelot *ocelot, int port)
{
struct ocelot_bridge_vlan *vlan;
int num_tagged = 0;
list_for_each_entry(vlan, &ocelot->vlans, list) {
if (!(vlan->portmask & BIT(port)))
continue;
if (!(vlan->untagged & BIT(port)))
num_tagged++;
}
return num_tagged;
}
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
/* We use native VLAN when we have to mix egress-tagged VLANs with exactly
* _one_ egress-untagged VLAN (_the_ native VLAN)
*/
static bool ocelot_port_uses_native_vlan(struct ocelot *ocelot, int port)
{
return ocelot_port_num_tagged_vlans(ocelot, port) &&
ocelot_port_num_untagged_vlans(ocelot, port) == 1;
}
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
static struct ocelot_bridge_vlan *
ocelot_port_find_native_vlan(struct ocelot *ocelot, int port)
{
struct ocelot_bridge_vlan *vlan;
list_for_each_entry(vlan, &ocelot->vlans, list)
if (vlan->portmask & BIT(port) && vlan->untagged & BIT(port))
return vlan;
return NULL;
}
/* Keep in sync REW_TAG_CFG_TAG_CFG and, if applicable,
* REW_PORT_VLAN_CFG_PORT_VID, with the bridge VLAN table and VLAN awareness
* state of the port.
*/
static void ocelot_port_manage_port_tag(struct ocelot *ocelot, int port)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
enum ocelot_port_tag_config tag_cfg;
bool uses_native_vlan = false;
net: mscc: ocelot: fix untagged packet drops when enslaving to vlan aware bridge To rehash a previous explanation given in commit 1c44ce560b4d ("net: mscc: ocelot: fix vlan_filtering when enslaving to bridge before link is up"), the switch driver operates the in a mode where a single VLAN can be transmitted as untagged on a particular egress port. That is the "native VLAN on trunk port" use case. The configuration for this native VLAN is driven in 2 ways: - Set the egress port rewriter to strip the VLAN tag for the native VID (as it is egress-untagged, after all). - Configure the ingress port to drop untagged and priority-tagged traffic, if there is no native VLAN. The intention of this setting is that a trunk port with no native VLAN should not accept untagged traffic. Since both of the above configurations for the native VLAN should only be done if VLAN awareness is requested, they are actually done from the ocelot_port_vlan_filtering function, after the basic procedure of toggling the VLAN awareness flag of the port. But there's a problem with that simplistic approach: we are trying to juggle with 2 independent variables from a single function: - Native VLAN of the port - its value is held in port->vid. - VLAN awareness state of the port - currently there are some issues here, more on that later*. The actual problem can be seen when enslaving the switch ports to a VLAN filtering bridge: 0. The driver configures a pvid of zero for each port, when in standalone mode. While the bridge configures a default_pvid of 1 for each port that gets added as a slave to it. 1. The bridge calls ocelot_port_vlan_filtering with vlan_aware=true. The VLAN-filtering-dependent portion of the native VLAN configuration is done, considering that the native VLAN is 0. 2. The bridge calls ocelot_vlan_add with vid=1, pvid=true, untagged=true. The native VLAN changes to 1 (change which gets propagated to hardware). 3. ??? - nobody calls ocelot_port_vlan_filtering again, to reapply the VLAN-filtering-dependent portion of the native VLAN configuration, for the new native VLAN of 1. One can notice that after toggling "ip link set dev br0 type bridge vlan_filtering 0 && ip link set dev br0 type bridge vlan_filtering 1", the new native VLAN finally makes it through and untagged traffic finally starts flowing again. But obviously that shouldn't be needed. So it is clear that 2 independent variables need to both re-trigger the native VLAN configuration. So we introduce the second variable as ocelot_port->vlan_aware. *Actually both the DSA Felix driver and the Ocelot driver already had each its own variable: - Ocelot: ocelot_port_private->vlan_aware - Felix: dsa_port->vlan_filtering but the common Ocelot library needs to work with a single, common, variable, so there is some refactoring done to move the vlan_aware property from the private structure into the common ocelot_port structure. Fixes: 97bb69e1e36e ("net: mscc: ocelot: break apart ocelot_vlan_port_apply") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-04-14 19:36:15 +00:00
if (ocelot_port->vlan_aware) {
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
uses_native_vlan = ocelot_port_uses_native_vlan(ocelot, port);
if (uses_native_vlan)
tag_cfg = OCELOT_PORT_TAG_NATIVE;
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
else if (ocelot_port_num_untagged_vlans(ocelot, port))
tag_cfg = OCELOT_PORT_TAG_DISABLED;
else
tag_cfg = OCELOT_PORT_TAG_TRUNK;
} else {
tag_cfg = OCELOT_PORT_TAG_DISABLED;
}
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
ocelot_rmw_gix(ocelot, REW_TAG_CFG_TAG_CFG(tag_cfg),
REW_TAG_CFG_TAG_CFG_M,
REW_TAG_CFG, port);
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
if (uses_native_vlan) {
struct ocelot_bridge_vlan *native_vlan;
/* Not having a native VLAN is impossible, because
* ocelot_port_num_untagged_vlans has returned 1.
* So there is no use in checking for NULL here.
*/
native_vlan = ocelot_port_find_native_vlan(ocelot, port);
ocelot_rmw_gix(ocelot,
REW_PORT_VLAN_CFG_PORT_VID(native_vlan->vid),
REW_PORT_VLAN_CFG_PORT_VID_M,
REW_PORT_VLAN_CFG, port);
}
}
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
int ocelot_bridge_num_find(struct ocelot *ocelot,
const struct net_device *bridge)
{
int port;
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (ocelot_port && ocelot_port->bridge == bridge)
return ocelot_port->bridge_num;
}
return -1;
}
EXPORT_SYMBOL_GPL(ocelot_bridge_num_find);
static u16 ocelot_vlan_unaware_pvid(struct ocelot *ocelot,
const struct net_device *bridge)
{
int bridge_num;
/* Standalone ports use VID 0 */
if (!bridge)
return 0;
bridge_num = ocelot_bridge_num_find(ocelot, bridge);
if (WARN_ON(bridge_num < 0))
return 0;
/* VLAN-unaware bridges use a reserved VID going from 4095 downwards */
return VLAN_N_VID - bridge_num - 1;
}
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
/* Default vlan to clasify for untagged frames (may be zero) */
static void ocelot_port_set_pvid(struct ocelot *ocelot, int port,
const struct ocelot_bridge_vlan *pvid_vlan)
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
u16 pvid = ocelot_vlan_unaware_pvid(ocelot, ocelot_port->bridge);
net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion Currently, the ocelot_port_set_native_vlan() function starts dropping untagged and prio-tagged traffic when the native VLAN is removed? What is the native VLAN? It is the only egress-untagged VLAN that ocelot supports on a port. If the port is a trunk with 100 VLANs, one of those VLANs can be transmitted as egress-untagged, and that's the native VLAN. Is it wrong to drop untagged and prio-tagged traffic if there's no native VLAN? Yes and no. In this case, which is more typical, it's ok to apply that drop configuration: $ bridge vlan add dev swp0 vid 1 pvid untagged <- this is the native VLAN $ bridge vlan add dev swp0 vid 100 $ bridge vlan add dev swp0 vid 101 $ bridge vlan del dev swp0 vid 1 <- delete the native VLAN But only because the pvid and the native VLAN have the same ID. In this case, it isn't: $ bridge vlan add dev swp0 vid 1 pvid $ bridge vlan add dev swp0 vid 100 untagged <- this is the native VLAN $ bridge vlan del dev swp0 vid 101 $ bridge vlan del dev swp0 vid 100 <- delete the native VLAN It's wrong, because the switch will drop untagged and prio-tagged traffic now, despite having a valid pvid of 1. The confusion seems to stem from the fact that the native VLAN is an egress setting, while the PVID is an ingress setting. It would be correct to drop untagged and prio-tagged traffic only if there was no pvid on the port. So let's do just that. Background: https://lore.kernel.org/netdev/CA+h21hrRMrLH-RjBGhEJSTZd6_QPRSd3RkVRQF-wNKkrgKcRSA@mail.gmail.com/#t Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:14 +00:00
u32 val = 0;
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
ocelot_port->pvid_vlan = pvid_vlan;
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
if (ocelot_port->vlan_aware && pvid_vlan)
pvid = pvid_vlan->vid;
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
ocelot_rmw_gix(ocelot,
ANA_PORT_VLAN_CFG_VLAN_VID(pvid),
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
ANA_PORT_VLAN_CFG_VLAN_VID_M,
ANA_PORT_VLAN_CFG, port);
net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion Currently, the ocelot_port_set_native_vlan() function starts dropping untagged and prio-tagged traffic when the native VLAN is removed? What is the native VLAN? It is the only egress-untagged VLAN that ocelot supports on a port. If the port is a trunk with 100 VLANs, one of those VLANs can be transmitted as egress-untagged, and that's the native VLAN. Is it wrong to drop untagged and prio-tagged traffic if there's no native VLAN? Yes and no. In this case, which is more typical, it's ok to apply that drop configuration: $ bridge vlan add dev swp0 vid 1 pvid untagged <- this is the native VLAN $ bridge vlan add dev swp0 vid 100 $ bridge vlan add dev swp0 vid 101 $ bridge vlan del dev swp0 vid 1 <- delete the native VLAN But only because the pvid and the native VLAN have the same ID. In this case, it isn't: $ bridge vlan add dev swp0 vid 1 pvid $ bridge vlan add dev swp0 vid 100 untagged <- this is the native VLAN $ bridge vlan del dev swp0 vid 101 $ bridge vlan del dev swp0 vid 100 <- delete the native VLAN It's wrong, because the switch will drop untagged and prio-tagged traffic now, despite having a valid pvid of 1. The confusion seems to stem from the fact that the native VLAN is an egress setting, while the PVID is an ingress setting. It would be correct to drop untagged and prio-tagged traffic only if there was no pvid on the port. So let's do just that. Background: https://lore.kernel.org/netdev/CA+h21hrRMrLH-RjBGhEJSTZd6_QPRSd3RkVRQF-wNKkrgKcRSA@mail.gmail.com/#t Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:14 +00:00
/* If there's no pvid, we should drop not only untagged traffic (which
* happens automatically), but also 802.1p traffic which gets
* classified to VLAN 0, but that is always in our RX filter, so it
* would get accepted were it not for this setting.
*/
if (!pvid_vlan && ocelot_port->vlan_aware)
net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion Currently, the ocelot_port_set_native_vlan() function starts dropping untagged and prio-tagged traffic when the native VLAN is removed? What is the native VLAN? It is the only egress-untagged VLAN that ocelot supports on a port. If the port is a trunk with 100 VLANs, one of those VLANs can be transmitted as egress-untagged, and that's the native VLAN. Is it wrong to drop untagged and prio-tagged traffic if there's no native VLAN? Yes and no. In this case, which is more typical, it's ok to apply that drop configuration: $ bridge vlan add dev swp0 vid 1 pvid untagged <- this is the native VLAN $ bridge vlan add dev swp0 vid 100 $ bridge vlan add dev swp0 vid 101 $ bridge vlan del dev swp0 vid 1 <- delete the native VLAN But only because the pvid and the native VLAN have the same ID. In this case, it isn't: $ bridge vlan add dev swp0 vid 1 pvid $ bridge vlan add dev swp0 vid 100 untagged <- this is the native VLAN $ bridge vlan del dev swp0 vid 101 $ bridge vlan del dev swp0 vid 100 <- delete the native VLAN It's wrong, because the switch will drop untagged and prio-tagged traffic now, despite having a valid pvid of 1. The confusion seems to stem from the fact that the native VLAN is an egress setting, while the PVID is an ingress setting. It would be correct to drop untagged and prio-tagged traffic only if there was no pvid on the port. So let's do just that. Background: https://lore.kernel.org/netdev/CA+h21hrRMrLH-RjBGhEJSTZd6_QPRSd3RkVRQF-wNKkrgKcRSA@mail.gmail.com/#t Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:14 +00:00
val = ANA_PORT_DROP_CFG_DROP_PRIO_S_TAGGED_ENA |
ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA;
ocelot_rmw_gix(ocelot, val,
ANA_PORT_DROP_CFG_DROP_PRIO_S_TAGGED_ENA |
ANA_PORT_DROP_CFG_DROP_PRIO_C_TAGGED_ENA,
ANA_PORT_DROP_CFG, port);
net: mscc: ocelot: use the pvid of zero when bridged with vlan_filtering=0 Currently, mscc_ocelot ports configure pvid=0 in standalone mode, and inherit the pvid from the bridge when one is present. When the bridge has vlan_filtering=0, the software semantics are that packets should be received regardless of whether there's a pvid configured on the ingress port or not. However, ocelot does not observe those semantics today. Moreover, changing the PVID is also a problem with vlan_filtering=0. We are privately remapping the VID of FDB, MDB entries to the port's PVID when those are VLAN-unaware (i.e. when the VID of these entries comes to us as 0). But we have no logic of adjusting that remapping when the user changes the pvid and vlan_filtering is 0. So stale entries would be left behind, and untagged traffic will stop matching on them. And even if we were to solve that, there's an even bigger problem. If swp0 has pvid 1, and swp1 has pvid 2, and both are under a vlan_filtering=0 bridge, they should be able to forward traffic between one another. However, with ocelot they wouldn't do that. The simplest way of fixing this is to never configure the pvid based on what the bridge is asking for, when vlan_filtering is 0. Only if there was a VLAN that the bridge couldn't mangle, that we could use as pvid.... So, turns out, there's 0 just for that. And for a reason: IEEE 802.1Q-2018, page 247, Table 9-2-Reserved VID values says: The null VID. Indicates that the tag header contains only priority information; no VID is present in the frame. This VID value shall not be configured as a PVID or a member ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ of a VID Set, or configured in any FDB entry, or used in any Management operation. So, aren't we doing exactly what 802.1Q says not to? Well, in a way, but what we're doing here is just driver-level bookkeeping, all for the better. The fact that we're using a pvid of 0 is not observable behavior from the outside world: the network stack does not see the classified VLAN that the switch uses, in vlan_filtering=0 mode. And we're also more consistent with the standalone mode now. And now that we use the pvid of 0 in this mode, there's another advantage: we don't need to perform any VID remapping for FDB and MDB entries either, we can just use the VID of 0 that the bridge is passing to us. The only gotcha is that every time we change the vlan_filtering setting, we need to reapply the pvid (either to 0, or to the value from the bridge). A small side-effect visible in the patch is that ocelot_port_set_pvid needs to be moved above ocelot_port_vlan_filtering, so that it can be called from there without forward-declarations. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:10 +00:00
}
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
static struct ocelot_bridge_vlan *ocelot_bridge_vlan_find(struct ocelot *ocelot,
u16 vid)
{
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
struct ocelot_bridge_vlan *vlan;
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
list_for_each_entry(vlan, &ocelot->vlans, list)
if (vlan->vid == vid)
return vlan;
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
return NULL;
}
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
static int ocelot_vlan_member_add(struct ocelot *ocelot, int port, u16 vid,
bool untagged)
{
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
struct ocelot_bridge_vlan *vlan = ocelot_bridge_vlan_find(ocelot, vid);
unsigned long portmask;
int err;
if (vlan) {
portmask = vlan->portmask | BIT(port);
err = ocelot_vlant_set_mask(ocelot, vid, portmask);
if (err)
return err;
vlan->portmask = portmask;
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
/* Bridge VLANs can be overwritten with a different
* egress-tagging setting, so make sure to override an untagged
* with a tagged VID if that's going on.
*/
if (untagged)
vlan->untagged |= BIT(port);
else
vlan->untagged &= ~BIT(port);
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
return 0;
}
vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
if (!vlan)
return -ENOMEM;
portmask = BIT(port);
err = ocelot_vlant_set_mask(ocelot, vid, portmask);
if (err) {
kfree(vlan);
return err;
}
vlan->vid = vid;
vlan->portmask = portmask;
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
if (untagged)
vlan->untagged = BIT(port);
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
INIT_LIST_HEAD(&vlan->list);
list_add_tail(&vlan->list, &ocelot->vlans);
return 0;
}
static int ocelot_vlan_member_del(struct ocelot *ocelot, int port, u16 vid)
{
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
struct ocelot_bridge_vlan *vlan = ocelot_bridge_vlan_find(ocelot, vid);
unsigned long portmask;
int err;
if (!vlan)
return 0;
portmask = vlan->portmask & ~BIT(port);
err = ocelot_vlant_set_mask(ocelot, vid, portmask);
if (err)
return err;
vlan->portmask = portmask;
if (vlan->portmask)
return 0;
list_del(&vlan->list);
kfree(vlan);
return 0;
}
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
static int ocelot_add_vlan_unaware_pvid(struct ocelot *ocelot, int port,
const struct net_device *bridge)
{
u16 vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
return ocelot_vlan_member_add(ocelot, port, vid, true);
}
static int ocelot_del_vlan_unaware_pvid(struct ocelot *ocelot, int port,
const struct net_device *bridge)
{
u16 vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
return ocelot_vlan_member_del(ocelot, port, vid);
}
net: dsa: propagate switchdev vlan_filtering prepare phase to drivers A driver may refuse to enable VLAN filtering for any reason beyond what the DSA framework cares about, such as: - having tc-flower rules that rely on the switch being VLAN-aware - the particular switch does not support VLAN, even if the driver does (the DSA framework just checks for the presence of the .port_vlan_add and .port_vlan_del pointers) - simply not supporting this configuration to be toggled at runtime Currently, when a driver rejects a configuration it cannot support, it does this from the commit phase, which triggers various warnings in switchdev. So propagate the prepare phase to drivers, to give them the ability to refuse invalid configurations cleanly and avoid the warnings. Since we need to modify all function prototypes and check for the prepare phase from within the drivers, take that opportunity and move the existing driver restrictions within the prepare phase where that is possible and easy. Cc: Florian Fainelli <f.fainelli@gmail.com> Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com> Cc: Hauke Mehrtens <hauke@hauke-m.de> Cc: Woojung Huh <woojung.huh@microchip.com> Cc: Microchip Linux Driver Support <UNGLinuxDriver@microchip.com> Cc: Sean Wang <sean.wang@mediatek.com> Cc: Landen Chao <Landen.Chao@mediatek.com> Cc: Andrew Lunn <andrew@lunn.ch> Cc: Vivien Didelot <vivien.didelot@gmail.com> Cc: Jonathan McDowell <noodles@earth.li> Cc: Linus Walleij <linus.walleij@linaro.org> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-10-02 22:06:46 +00:00
int ocelot_port_vlan_filtering(struct ocelot *ocelot, int port,
bool vlan_aware, struct netlink_ext_ack *extack)
{
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 00:01:50 +00:00
struct ocelot_vcap_block *block = &ocelot->block[VCAP_IS1];
struct ocelot_port *ocelot_port = ocelot->ports[port];
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 00:01:50 +00:00
struct ocelot_vcap_filter *filter;
net: mscc: ocelot: don't add VID 0 to ocelot->vlans when leaving VLAN-aware bridge DSA, through dsa_port_bridge_leave(), first notifies the port of the fact that it left a bridge, then, if that bridge was VLAN-aware, it notifies the port of the change in VLAN awareness state, towards VLAN-unaware mode. So ocelot_port_vlan_filtering() can be called when ocelot_port->bridge is NULL, and this makes ocelot_add_vlan_unaware_pvid() create a struct ocelot_bridge_vlan with a vid of 0 and an "untagged" setting of true on that port. In a way this structure correctly reflects the reality, but by design, VID 0 (OCELOT_STANDALONE_PVID) was not meant to be kept in the bridge VLAN list of the driver, but managed separately. Having OCELOT_STANDALONE_PVID in ocelot->vlans makes us trip up on several sanity checks that did not expect to have this VID there. For example, after we leave a VLAN-aware bridge and we re-join it, we can no longer program egress-tagged VLANs to hardware: # ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up # ip link set swp0 master br0 # ip link set swp0 nomaster # ip link set swp0 master br0 # bridge vlan add dev swp0 vid 100 Error: mscc_ocelot_switch_lib: Port with more than one egress-untagged VLAN cannot have egress-tagged VLANs. But this configuration is in fact supported by the hardware, since we could use OCELOT_PORT_TAG_NATIVE. According to its comment: /* all VLANs except the native VLAN and VID 0 are egress-tagged */ yet when assessing the eligibility for this mode, we do not check for VID 0 in ocelot_port_uses_native_vlan(), instead we just ensure that ocelot_port_num_untagged_vlans() == 1. This is simply because VID 0 doesn't have a bridge VLAN structure. The way I identify the problem is that ocelot_port_vlan_filtering(false) only means to call ocelot_add_vlan_unaware_pvid() when we dynamically turn off VLAN awareness for a bridge we are under, and the PVID changes from the bridge PVID to a reserved PVID based on the bridge number. Since OCELOT_STANDALONE_PVID is statically added to the VLAN table during ocelot_vlan_init() and never removed afterwards, calling ocelot_add_vlan_unaware_pvid() for it is not intended and does not serve any purpose. Fix the issue by avoiding the call to ocelot_add_vlan_unaware_pvid(vid=0) when we're resetting VLAN awareness after leaving the bridge, to become a standalone port. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-21 23:01:05 +00:00
int err = 0;
net: mscc: ocelot: fix untagged packet drops when enslaving to vlan aware bridge To rehash a previous explanation given in commit 1c44ce560b4d ("net: mscc: ocelot: fix vlan_filtering when enslaving to bridge before link is up"), the switch driver operates the in a mode where a single VLAN can be transmitted as untagged on a particular egress port. That is the "native VLAN on trunk port" use case. The configuration for this native VLAN is driven in 2 ways: - Set the egress port rewriter to strip the VLAN tag for the native VID (as it is egress-untagged, after all). - Configure the ingress port to drop untagged and priority-tagged traffic, if there is no native VLAN. The intention of this setting is that a trunk port with no native VLAN should not accept untagged traffic. Since both of the above configurations for the native VLAN should only be done if VLAN awareness is requested, they are actually done from the ocelot_port_vlan_filtering function, after the basic procedure of toggling the VLAN awareness flag of the port. But there's a problem with that simplistic approach: we are trying to juggle with 2 independent variables from a single function: - Native VLAN of the port - its value is held in port->vid. - VLAN awareness state of the port - currently there are some issues here, more on that later*. The actual problem can be seen when enslaving the switch ports to a VLAN filtering bridge: 0. The driver configures a pvid of zero for each port, when in standalone mode. While the bridge configures a default_pvid of 1 for each port that gets added as a slave to it. 1. The bridge calls ocelot_port_vlan_filtering with vlan_aware=true. The VLAN-filtering-dependent portion of the native VLAN configuration is done, considering that the native VLAN is 0. 2. The bridge calls ocelot_vlan_add with vid=1, pvid=true, untagged=true. The native VLAN changes to 1 (change which gets propagated to hardware). 3. ??? - nobody calls ocelot_port_vlan_filtering again, to reapply the VLAN-filtering-dependent portion of the native VLAN configuration, for the new native VLAN of 1. One can notice that after toggling "ip link set dev br0 type bridge vlan_filtering 0 && ip link set dev br0 type bridge vlan_filtering 1", the new native VLAN finally makes it through and untagged traffic finally starts flowing again. But obviously that shouldn't be needed. So it is clear that 2 independent variables need to both re-trigger the native VLAN configuration. So we introduce the second variable as ocelot_port->vlan_aware. *Actually both the DSA Felix driver and the Ocelot driver already had each its own variable: - Ocelot: ocelot_port_private->vlan_aware - Felix: dsa_port->vlan_filtering but the common Ocelot library needs to work with a single, common, variable, so there is some refactoring done to move the vlan_aware property from the private structure into the common ocelot_port structure. Fixes: 97bb69e1e36e ("net: mscc: ocelot: break apart ocelot_vlan_port_apply") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-04-14 19:36:15 +00:00
u32 val;
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 00:01:50 +00:00
list_for_each_entry(filter, &block->rules, list) {
if (filter->ingress_port_mask & BIT(port) &&
filter->action.vid_replace_ena) {
NL_SET_ERR_MSG_MOD(extack,
"Cannot change VLAN state with vlan modify rules active");
net: switchdev: remove the transaction structure from port attributes Since the introduction of the switchdev API, port attributes were transmitted to drivers for offloading using a two-step transactional model, with a prepare phase that was supposed to catch all errors, and a commit phase that was supposed to never fail. Some classes of failures can never be avoided, like hardware access, or memory allocation. In the latter case, merely attempting to move the memory allocation to the preparation phase makes it impossible to avoid memory leaks, since commit 91cf8eceffc1 ("switchdev: Remove unused transaction item queue") which has removed the unused mechanism of passing on the allocated memory between one phase and another. It is time we admit that separating the preparation from the commit phase is something that is best left for the driver to decide, and not something that should be baked into the API, especially since there are no switchdev callers that depend on this. This patch removes the struct switchdev_trans member from switchdev port attribute notifier structures, and converts drivers to not look at this member. In part, this patch contains a revert of my previous commit 2e554a7a5d8a ("net: dsa: propagate switchdev vlan_filtering prepare phase to drivers"). For the most part, the conversion was trivial except for: - Rocker's world implementation based on Broadcom OF-DPA had an odd implementation of ofdpa_port_attr_bridge_flags_set. The conversion was done mechanically, by pasting the implementation twice, then only keeping the code that would get executed during prepare phase on top, then only keeping the code that gets executed during the commit phase on bottom, then simplifying the resulting code until this was obtained. - DSA's offloading of STP state, bridge flags, VLAN filtering and multicast router could be converted right away. But the ageing time could not, so a shim was introduced and this was left for a further commit. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Linus Walleij <linus.walleij@linaro.org> Acked-by: Jiri Pirko <jiri@nvidia.com> Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de> # hellcreek Reviewed-by: Linus Walleij <linus.walleij@linaro.org> # RTL8366RB Reviewed-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-09 00:01:50 +00:00
return -EBUSY;
net: mscc: ocelot: offload VLAN mangle action to VCAP IS1 The VCAP_IS1_ACT_VID_REPLACE_ENA action, from the VCAP IS1 ingress TCAM, changes the classified VLAN. We are only exposing this ability for switch ports that are under VLAN aware bridges. This is because in standalone ports mode and under a bridge with vlan_filtering=0, the ocelot driver configures the switch to operate as VLAN-unaware, so the classified VLAN is not derived from the 802.1Q header from the packet, but instead is always equal to the port-based VLAN ID of the ingress port. We _can_ still change the classified VLAN for packets when operating in this mode, but the end result will most likely be a drop, since both the ingress and the egress port need to be members of the modified VLAN. And even if we install the new classified VLAN into the VLAN table of the switch, the result would still not be as expected: we wouldn't see, on the output port, the modified VLAN tag, but the original one, even though the classified VLAN was indeed modified. This is because of how the hardware works: on egress, what is pushed to the frame is a "port tag", which gives us the following options: - Tag all frames with port tag (derived from the classified VLAN) - Tag all frames with port tag, except if the classified VLAN is 0 or equal to the native VLAN of the egress port - No port tag Needless to say, in VLAN-unaware mode we are disabling the port tag. Otherwise, the existing VLAN tag would be ignored, and a second VLAN tag (the port tag), holding the classified VLAN, would be pushed (instead of replacing the existing 802.1Q tag). This is definitely not what the user wanted when installing a "vlan modify" action. So it is simply not worth bothering with VLAN modify rules under other configurations except when the ports are fully VLAN-aware. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-08 11:56:58 +00:00
}
}
net: dsa: propagate switchdev vlan_filtering prepare phase to drivers A driver may refuse to enable VLAN filtering for any reason beyond what the DSA framework cares about, such as: - having tc-flower rules that rely on the switch being VLAN-aware - the particular switch does not support VLAN, even if the driver does (the DSA framework just checks for the presence of the .port_vlan_add and .port_vlan_del pointers) - simply not supporting this configuration to be toggled at runtime Currently, when a driver rejects a configuration it cannot support, it does this from the commit phase, which triggers various warnings in switchdev. So propagate the prepare phase to drivers, to give them the ability to refuse invalid configurations cleanly and avoid the warnings. Since we need to modify all function prototypes and check for the prepare phase from within the drivers, take that opportunity and move the existing driver restrictions within the prepare phase where that is possible and easy. Cc: Florian Fainelli <f.fainelli@gmail.com> Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com> Cc: Hauke Mehrtens <hauke@hauke-m.de> Cc: Woojung Huh <woojung.huh@microchip.com> Cc: Microchip Linux Driver Support <UNGLinuxDriver@microchip.com> Cc: Sean Wang <sean.wang@mediatek.com> Cc: Landen Chao <Landen.Chao@mediatek.com> Cc: Andrew Lunn <andrew@lunn.ch> Cc: Vivien Didelot <vivien.didelot@gmail.com> Cc: Jonathan McDowell <noodles@earth.li> Cc: Linus Walleij <linus.walleij@linaro.org> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-10-02 22:06:46 +00:00
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
err = ocelot_single_vlan_aware_bridge(ocelot, extack);
if (err)
return err;
if (vlan_aware)
err = ocelot_del_vlan_unaware_pvid(ocelot, port,
ocelot_port->bridge);
net: mscc: ocelot: don't add VID 0 to ocelot->vlans when leaving VLAN-aware bridge DSA, through dsa_port_bridge_leave(), first notifies the port of the fact that it left a bridge, then, if that bridge was VLAN-aware, it notifies the port of the change in VLAN awareness state, towards VLAN-unaware mode. So ocelot_port_vlan_filtering() can be called when ocelot_port->bridge is NULL, and this makes ocelot_add_vlan_unaware_pvid() create a struct ocelot_bridge_vlan with a vid of 0 and an "untagged" setting of true on that port. In a way this structure correctly reflects the reality, but by design, VID 0 (OCELOT_STANDALONE_PVID) was not meant to be kept in the bridge VLAN list of the driver, but managed separately. Having OCELOT_STANDALONE_PVID in ocelot->vlans makes us trip up on several sanity checks that did not expect to have this VID there. For example, after we leave a VLAN-aware bridge and we re-join it, we can no longer program egress-tagged VLANs to hardware: # ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up # ip link set swp0 master br0 # ip link set swp0 nomaster # ip link set swp0 master br0 # bridge vlan add dev swp0 vid 100 Error: mscc_ocelot_switch_lib: Port with more than one egress-untagged VLAN cannot have egress-tagged VLANs. But this configuration is in fact supported by the hardware, since we could use OCELOT_PORT_TAG_NATIVE. According to its comment: /* all VLANs except the native VLAN and VID 0 are egress-tagged */ yet when assessing the eligibility for this mode, we do not check for VID 0 in ocelot_port_uses_native_vlan(), instead we just ensure that ocelot_port_num_untagged_vlans() == 1. This is simply because VID 0 doesn't have a bridge VLAN structure. The way I identify the problem is that ocelot_port_vlan_filtering(false) only means to call ocelot_add_vlan_unaware_pvid() when we dynamically turn off VLAN awareness for a bridge we are under, and the PVID changes from the bridge PVID to a reserved PVID based on the bridge number. Since OCELOT_STANDALONE_PVID is statically added to the VLAN table during ocelot_vlan_init() and never removed afterwards, calling ocelot_add_vlan_unaware_pvid() for it is not intended and does not serve any purpose. Fix the issue by avoiding the call to ocelot_add_vlan_unaware_pvid(vid=0) when we're resetting VLAN awareness after leaving the bridge, to become a standalone port. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-21 23:01:05 +00:00
else if (ocelot_port->bridge)
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
err = ocelot_add_vlan_unaware_pvid(ocelot, port,
ocelot_port->bridge);
if (err)
return err;
net: mscc: ocelot: fix untagged packet drops when enslaving to vlan aware bridge To rehash a previous explanation given in commit 1c44ce560b4d ("net: mscc: ocelot: fix vlan_filtering when enslaving to bridge before link is up"), the switch driver operates the in a mode where a single VLAN can be transmitted as untagged on a particular egress port. That is the "native VLAN on trunk port" use case. The configuration for this native VLAN is driven in 2 ways: - Set the egress port rewriter to strip the VLAN tag for the native VID (as it is egress-untagged, after all). - Configure the ingress port to drop untagged and priority-tagged traffic, if there is no native VLAN. The intention of this setting is that a trunk port with no native VLAN should not accept untagged traffic. Since both of the above configurations for the native VLAN should only be done if VLAN awareness is requested, they are actually done from the ocelot_port_vlan_filtering function, after the basic procedure of toggling the VLAN awareness flag of the port. But there's a problem with that simplistic approach: we are trying to juggle with 2 independent variables from a single function: - Native VLAN of the port - its value is held in port->vid. - VLAN awareness state of the port - currently there are some issues here, more on that later*. The actual problem can be seen when enslaving the switch ports to a VLAN filtering bridge: 0. The driver configures a pvid of zero for each port, when in standalone mode. While the bridge configures a default_pvid of 1 for each port that gets added as a slave to it. 1. The bridge calls ocelot_port_vlan_filtering with vlan_aware=true. The VLAN-filtering-dependent portion of the native VLAN configuration is done, considering that the native VLAN is 0. 2. The bridge calls ocelot_vlan_add with vid=1, pvid=true, untagged=true. The native VLAN changes to 1 (change which gets propagated to hardware). 3. ??? - nobody calls ocelot_port_vlan_filtering again, to reapply the VLAN-filtering-dependent portion of the native VLAN configuration, for the new native VLAN of 1. One can notice that after toggling "ip link set dev br0 type bridge vlan_filtering 0 && ip link set dev br0 type bridge vlan_filtering 1", the new native VLAN finally makes it through and untagged traffic finally starts flowing again. But obviously that shouldn't be needed. So it is clear that 2 independent variables need to both re-trigger the native VLAN configuration. So we introduce the second variable as ocelot_port->vlan_aware. *Actually both the DSA Felix driver and the Ocelot driver already had each its own variable: - Ocelot: ocelot_port_private->vlan_aware - Felix: dsa_port->vlan_filtering but the common Ocelot library needs to work with a single, common, variable, so there is some refactoring done to move the vlan_aware property from the private structure into the common ocelot_port structure. Fixes: 97bb69e1e36e ("net: mscc: ocelot: break apart ocelot_vlan_port_apply") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-04-14 19:36:15 +00:00
ocelot_port->vlan_aware = vlan_aware;
net: mscc: ocelot: fix untagged packet drops when enslaving to vlan aware bridge To rehash a previous explanation given in commit 1c44ce560b4d ("net: mscc: ocelot: fix vlan_filtering when enslaving to bridge before link is up"), the switch driver operates the in a mode where a single VLAN can be transmitted as untagged on a particular egress port. That is the "native VLAN on trunk port" use case. The configuration for this native VLAN is driven in 2 ways: - Set the egress port rewriter to strip the VLAN tag for the native VID (as it is egress-untagged, after all). - Configure the ingress port to drop untagged and priority-tagged traffic, if there is no native VLAN. The intention of this setting is that a trunk port with no native VLAN should not accept untagged traffic. Since both of the above configurations for the native VLAN should only be done if VLAN awareness is requested, they are actually done from the ocelot_port_vlan_filtering function, after the basic procedure of toggling the VLAN awareness flag of the port. But there's a problem with that simplistic approach: we are trying to juggle with 2 independent variables from a single function: - Native VLAN of the port - its value is held in port->vid. - VLAN awareness state of the port - currently there are some issues here, more on that later*. The actual problem can be seen when enslaving the switch ports to a VLAN filtering bridge: 0. The driver configures a pvid of zero for each port, when in standalone mode. While the bridge configures a default_pvid of 1 for each port that gets added as a slave to it. 1. The bridge calls ocelot_port_vlan_filtering with vlan_aware=true. The VLAN-filtering-dependent portion of the native VLAN configuration is done, considering that the native VLAN is 0. 2. The bridge calls ocelot_vlan_add with vid=1, pvid=true, untagged=true. The native VLAN changes to 1 (change which gets propagated to hardware). 3. ??? - nobody calls ocelot_port_vlan_filtering again, to reapply the VLAN-filtering-dependent portion of the native VLAN configuration, for the new native VLAN of 1. One can notice that after toggling "ip link set dev br0 type bridge vlan_filtering 0 && ip link set dev br0 type bridge vlan_filtering 1", the new native VLAN finally makes it through and untagged traffic finally starts flowing again. But obviously that shouldn't be needed. So it is clear that 2 independent variables need to both re-trigger the native VLAN configuration. So we introduce the second variable as ocelot_port->vlan_aware. *Actually both the DSA Felix driver and the Ocelot driver already had each its own variable: - Ocelot: ocelot_port_private->vlan_aware - Felix: dsa_port->vlan_filtering but the common Ocelot library needs to work with a single, common, variable, so there is some refactoring done to move the vlan_aware property from the private structure into the common ocelot_port structure. Fixes: 97bb69e1e36e ("net: mscc: ocelot: break apart ocelot_vlan_port_apply") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-04-14 19:36:15 +00:00
if (vlan_aware)
val = ANA_PORT_VLAN_CFG_VLAN_AWARE_ENA |
ANA_PORT_VLAN_CFG_VLAN_POP_CNT(1);
else
val = 0;
ocelot_rmw_gix(ocelot, val,
ANA_PORT_VLAN_CFG_VLAN_AWARE_ENA |
ANA_PORT_VLAN_CFG_VLAN_POP_CNT_M,
ANA_PORT_VLAN_CFG, port);
ocelot_port_set_pvid(ocelot, port, ocelot_port->pvid_vlan);
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
ocelot_port_manage_port_tag(ocelot, port);
net: dsa: propagate switchdev vlan_filtering prepare phase to drivers A driver may refuse to enable VLAN filtering for any reason beyond what the DSA framework cares about, such as: - having tc-flower rules that rely on the switch being VLAN-aware - the particular switch does not support VLAN, even if the driver does (the DSA framework just checks for the presence of the .port_vlan_add and .port_vlan_del pointers) - simply not supporting this configuration to be toggled at runtime Currently, when a driver rejects a configuration it cannot support, it does this from the commit phase, which triggers various warnings in switchdev. So propagate the prepare phase to drivers, to give them the ability to refuse invalid configurations cleanly and avoid the warnings. Since we need to modify all function prototypes and check for the prepare phase from within the drivers, take that opportunity and move the existing driver restrictions within the prepare phase where that is possible and easy. Cc: Florian Fainelli <f.fainelli@gmail.com> Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com> Cc: Hauke Mehrtens <hauke@hauke-m.de> Cc: Woojung Huh <woojung.huh@microchip.com> Cc: Microchip Linux Driver Support <UNGLinuxDriver@microchip.com> Cc: Sean Wang <sean.wang@mediatek.com> Cc: Landen Chao <Landen.Chao@mediatek.com> Cc: Andrew Lunn <andrew@lunn.ch> Cc: Vivien Didelot <vivien.didelot@gmail.com> Cc: Jonathan McDowell <noodles@earth.li> Cc: Linus Walleij <linus.walleij@linaro.org> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-10-02 22:06:46 +00:00
return 0;
}
net: mscc: ocelot: fix untagged packet drops when enslaving to vlan aware bridge To rehash a previous explanation given in commit 1c44ce560b4d ("net: mscc: ocelot: fix vlan_filtering when enslaving to bridge before link is up"), the switch driver operates the in a mode where a single VLAN can be transmitted as untagged on a particular egress port. That is the "native VLAN on trunk port" use case. The configuration for this native VLAN is driven in 2 ways: - Set the egress port rewriter to strip the VLAN tag for the native VID (as it is egress-untagged, after all). - Configure the ingress port to drop untagged and priority-tagged traffic, if there is no native VLAN. The intention of this setting is that a trunk port with no native VLAN should not accept untagged traffic. Since both of the above configurations for the native VLAN should only be done if VLAN awareness is requested, they are actually done from the ocelot_port_vlan_filtering function, after the basic procedure of toggling the VLAN awareness flag of the port. But there's a problem with that simplistic approach: we are trying to juggle with 2 independent variables from a single function: - Native VLAN of the port - its value is held in port->vid. - VLAN awareness state of the port - currently there are some issues here, more on that later*. The actual problem can be seen when enslaving the switch ports to a VLAN filtering bridge: 0. The driver configures a pvid of zero for each port, when in standalone mode. While the bridge configures a default_pvid of 1 for each port that gets added as a slave to it. 1. The bridge calls ocelot_port_vlan_filtering with vlan_aware=true. The VLAN-filtering-dependent portion of the native VLAN configuration is done, considering that the native VLAN is 0. 2. The bridge calls ocelot_vlan_add with vid=1, pvid=true, untagged=true. The native VLAN changes to 1 (change which gets propagated to hardware). 3. ??? - nobody calls ocelot_port_vlan_filtering again, to reapply the VLAN-filtering-dependent portion of the native VLAN configuration, for the new native VLAN of 1. One can notice that after toggling "ip link set dev br0 type bridge vlan_filtering 0 && ip link set dev br0 type bridge vlan_filtering 1", the new native VLAN finally makes it through and untagged traffic finally starts flowing again. But obviously that shouldn't be needed. So it is clear that 2 independent variables need to both re-trigger the native VLAN configuration. So we introduce the second variable as ocelot_port->vlan_aware. *Actually both the DSA Felix driver and the Ocelot driver already had each its own variable: - Ocelot: ocelot_port_private->vlan_aware - Felix: dsa_port->vlan_filtering but the common Ocelot library needs to work with a single, common, variable, so there is some refactoring done to move the vlan_aware property from the private structure into the common ocelot_port structure. Fixes: 97bb69e1e36e ("net: mscc: ocelot: break apart ocelot_vlan_port_apply") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Horatiu Vultur <horatiu.vultur@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-04-14 19:36:15 +00:00
EXPORT_SYMBOL(ocelot_port_vlan_filtering);
int ocelot_vlan_prepare(struct ocelot *ocelot, int port, u16 vid, bool pvid,
bool untagged, struct netlink_ext_ack *extack)
{
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
if (untagged) {
/* We are adding an egress-tagged VLAN */
if (ocelot_port_uses_native_vlan(ocelot, port)) {
NL_SET_ERR_MSG_MOD(extack,
"Port with egress-tagged VLANs cannot have more than one egress-untagged (native) VLAN");
return -EBUSY;
}
} else {
/* We are adding an egress-tagged VLAN */
if (ocelot_port_num_untagged_vlans(ocelot, port) > 1) {
NL_SET_ERR_MSG_MOD(extack,
"Port with more than one egress-untagged VLAN cannot have egress-tagged VLANs");
return -EBUSY;
}
}
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (vid > OCELOT_RSV_VLAN_RANGE_START) {
NL_SET_ERR_MSG_MOD(extack,
"VLAN range 4000-4095 reserved for VLAN-unaware bridging");
return -EBUSY;
}
return 0;
}
EXPORT_SYMBOL(ocelot_vlan_prepare);
int ocelot_vlan_add(struct ocelot *ocelot, int port, u16 vid, bool pvid,
bool untagged)
{
int err;
net: mscc: ocelot: ignore VID 0 added by 8021q module Both the felix DSA driver and ocelot switchdev driver declare dev->features & NETIF_F_HW_VLAN_CTAG_FILTER under certain circumstances*, so the 8021q module will add VID 0 to our RX filter when the port goes up, to ensure 802.1p traffic is not dropped. We treat VID 0 as a special value (OCELOT_STANDALONE_PVID) which deliberately does not have a struct ocelot_bridge_vlan associated with it. Instead, this gets programmed to the VLAN table in ocelot_vlan_init(). If we allow external calls to modify VID 0, we reach the following situation: # ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up # ip link set swp0 master br0 # ip link set swp0 up # this adds VID 0 to ocelot->vlans with untagged=false bridge vlan port vlan-id swp0 1 PVID Egress Untagged # the bridge also adds VID 1 br0 1 PVID Egress Untagged # bridge vlan add dev swp0 vid 100 untagged Error: mscc_ocelot_switch_lib: Port with egress-tagged VLANs cannot have more than one egress-untagged (native) VLAN. This configuration should have been accepted, because ocelot_port_manage_port_tag() should select OCELOT_PORT_TAG_NATIVE. Yet it isn't, because we have an entry in ocelot->vlans which says VID 0 should be egress-tagged, something the hardware can't do. Fix this by suppressing additions/deletions on VID 0 and managing this VLAN exclusively using OCELOT_STANDALONE_PVID. *DSA toggles it when the port becomes VLAN-aware by joining a VLAN-aware bridge. Ocelot declares it unconditionally for some reason. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-21 23:01:04 +00:00
/* Ignore VID 0 added to our RX filter by the 8021q module, since
* that collides with OCELOT_STANDALONE_PVID and changes it from
* egress-untagged to egress-tagged.
*/
if (!vid)
return 0;
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
err = ocelot_vlan_member_add(ocelot, port, vid, untagged);
if (err)
return err;
/* Default ingress vlan classification */
if (pvid)
ocelot_port_set_pvid(ocelot, port,
ocelot_bridge_vlan_find(ocelot, vid));
/* Untagged egress vlan clasification */
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
ocelot_port_manage_port_tag(ocelot, port);
return 0;
}
EXPORT_SYMBOL(ocelot_vlan_add);
int ocelot_vlan_del(struct ocelot *ocelot, int port, u16 vid)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
bool del_pvid = false;
int err;
net: mscc: ocelot: ignore VID 0 added by 8021q module Both the felix DSA driver and ocelot switchdev driver declare dev->features & NETIF_F_HW_VLAN_CTAG_FILTER under certain circumstances*, so the 8021q module will add VID 0 to our RX filter when the port goes up, to ensure 802.1p traffic is not dropped. We treat VID 0 as a special value (OCELOT_STANDALONE_PVID) which deliberately does not have a struct ocelot_bridge_vlan associated with it. Instead, this gets programmed to the VLAN table in ocelot_vlan_init(). If we allow external calls to modify VID 0, we reach the following situation: # ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up # ip link set swp0 master br0 # ip link set swp0 up # this adds VID 0 to ocelot->vlans with untagged=false bridge vlan port vlan-id swp0 1 PVID Egress Untagged # the bridge also adds VID 1 br0 1 PVID Egress Untagged # bridge vlan add dev swp0 vid 100 untagged Error: mscc_ocelot_switch_lib: Port with egress-tagged VLANs cannot have more than one egress-untagged (native) VLAN. This configuration should have been accepted, because ocelot_port_manage_port_tag() should select OCELOT_PORT_TAG_NATIVE. Yet it isn't, because we have an entry in ocelot->vlans which says VID 0 should be egress-tagged, something the hardware can't do. Fix this by suppressing additions/deletions on VID 0 and managing this VLAN exclusively using OCELOT_STANDALONE_PVID. *DSA toggles it when the port becomes VLAN-aware by joining a VLAN-aware bridge. Ocelot declares it unconditionally for some reason. Fixes: 54c319846086 ("net: mscc: ocelot: enforce FDB isolation when VLAN-unaware") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-04-21 23:01:04 +00:00
if (!vid)
return 0;
if (ocelot_port->pvid_vlan && ocelot_port->pvid_vlan->vid == vid)
del_pvid = true;
err = ocelot_vlan_member_del(ocelot, port, vid);
if (err)
return err;
net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion Currently, the ocelot_port_set_native_vlan() function starts dropping untagged and prio-tagged traffic when the native VLAN is removed? What is the native VLAN? It is the only egress-untagged VLAN that ocelot supports on a port. If the port is a trunk with 100 VLANs, one of those VLANs can be transmitted as egress-untagged, and that's the native VLAN. Is it wrong to drop untagged and prio-tagged traffic if there's no native VLAN? Yes and no. In this case, which is more typical, it's ok to apply that drop configuration: $ bridge vlan add dev swp0 vid 1 pvid untagged <- this is the native VLAN $ bridge vlan add dev swp0 vid 100 $ bridge vlan add dev swp0 vid 101 $ bridge vlan del dev swp0 vid 1 <- delete the native VLAN But only because the pvid and the native VLAN have the same ID. In this case, it isn't: $ bridge vlan add dev swp0 vid 1 pvid $ bridge vlan add dev swp0 vid 100 untagged <- this is the native VLAN $ bridge vlan del dev swp0 vid 101 $ bridge vlan del dev swp0 vid 100 <- delete the native VLAN It's wrong, because the switch will drop untagged and prio-tagged traffic now, despite having a valid pvid of 1. The confusion seems to stem from the fact that the native VLAN is an egress setting, while the PVID is an ingress setting. It would be correct to drop untagged and prio-tagged traffic only if there was no pvid on the port. So let's do just that. Background: https://lore.kernel.org/netdev/CA+h21hrRMrLH-RjBGhEJSTZd6_QPRSd3RkVRQF-wNKkrgKcRSA@mail.gmail.com/#t Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:14 +00:00
/* Ingress */
if (del_pvid)
ocelot_port_set_pvid(ocelot, port, NULL);
net: mscc: ocelot: move the logic to drop 802.1p traffic to the pvid deletion Currently, the ocelot_port_set_native_vlan() function starts dropping untagged and prio-tagged traffic when the native VLAN is removed? What is the native VLAN? It is the only egress-untagged VLAN that ocelot supports on a port. If the port is a trunk with 100 VLANs, one of those VLANs can be transmitted as egress-untagged, and that's the native VLAN. Is it wrong to drop untagged and prio-tagged traffic if there's no native VLAN? Yes and no. In this case, which is more typical, it's ok to apply that drop configuration: $ bridge vlan add dev swp0 vid 1 pvid untagged <- this is the native VLAN $ bridge vlan add dev swp0 vid 100 $ bridge vlan add dev swp0 vid 101 $ bridge vlan del dev swp0 vid 1 <- delete the native VLAN But only because the pvid and the native VLAN have the same ID. In this case, it isn't: $ bridge vlan add dev swp0 vid 1 pvid $ bridge vlan add dev swp0 vid 100 untagged <- this is the native VLAN $ bridge vlan del dev swp0 vid 101 $ bridge vlan del dev swp0 vid 100 <- delete the native VLAN It's wrong, because the switch will drop untagged and prio-tagged traffic now, despite having a valid pvid of 1. The confusion seems to stem from the fact that the native VLAN is an egress setting, while the PVID is an ingress setting. It would be correct to drop untagged and prio-tagged traffic only if there was no pvid on the port. So let's do just that. Background: https://lore.kernel.org/netdev/CA+h21hrRMrLH-RjBGhEJSTZd6_QPRSd3RkVRQF-wNKkrgKcRSA@mail.gmail.com/#t Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-31 10:29:14 +00:00
/* Egress */
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
ocelot_port_manage_port_tag(ocelot, port);
return 0;
}
EXPORT_SYMBOL(ocelot_vlan_del);
static void ocelot_vlan_init(struct ocelot *ocelot)
{
unsigned long all_ports = GENMASK(ocelot->num_phys_ports - 1, 0);
u16 port, vid;
/* Clear VLAN table, by default all ports are members of all VLANs */
ocelot_write(ocelot, ANA_TABLES_VLANACCESS_CMD_INIT,
ANA_TABLES_VLANACCESS);
ocelot_vlant_wait_for_completion(ocelot);
/* Configure the port VLAN memberships */
for (vid = 1; vid < VLAN_N_VID; vid++)
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
ocelot_vlant_set_mask(ocelot, vid, 0);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
/* We need VID 0 to get traffic on standalone ports.
* It is added automatically if the 8021q module is loaded, but we
* can't rely on that since it might not be.
*/
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
ocelot_vlant_set_mask(ocelot, OCELOT_STANDALONE_PVID, all_ports);
/* Set vlan ingress filter mask to all ports but the CPU port by
* default.
*/
ocelot_write(ocelot, all_ports, ANA_VLANMASK);
for (port = 0; port < ocelot->num_phys_ports; port++) {
ocelot_write_gix(ocelot, 0, REW_PORT_VLAN_CFG, port);
ocelot_write_gix(ocelot, 0, REW_TAG_CFG, port);
}
}
net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-08 17:36:27 +00:00
static u32 ocelot_read_eq_avail(struct ocelot *ocelot, int port)
{
return ocelot_read_rix(ocelot, QSYS_SW_STATUS, port);
}
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
static int ocelot_port_flush(struct ocelot *ocelot, int port)
net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-08 17:36:27 +00:00
{
unsigned int pause_ena;
net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-08 17:36:27 +00:00
int err, val;
/* Disable dequeuing from the egress queues */
ocelot_rmw_rix(ocelot, QSYS_PORT_MODE_DEQUEUE_DIS,
QSYS_PORT_MODE_DEQUEUE_DIS,
QSYS_PORT_MODE, port);
/* Disable flow control */
ocelot_fields_read(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, &pause_ena);
net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-08 17:36:27 +00:00
ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 0);
/* Disable priority flow control */
ocelot_fields_write(ocelot, port,
QSYS_SWITCH_PORT_MODE_TX_PFC_ENA, 0);
/* Wait at least the time it takes to receive a frame of maximum length
* at the port.
* Worst-case delays for 10 kilobyte jumbo frames are:
* 8 ms on a 10M port
* 800 μs on a 100M port
* 80 μs on a 1G port
* 32 μs on a 2.5G port
*/
usleep_range(8000, 10000);
/* Disable half duplex backpressure. */
ocelot_rmw_rix(ocelot, 0, SYS_FRONT_PORT_MODE_HDX_MODE,
SYS_FRONT_PORT_MODE, port);
/* Flush the queues associated with the port. */
ocelot_rmw_gix(ocelot, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG_FLUSH_ENA,
REW_PORT_CFG, port);
/* Enable dequeuing from the egress queues. */
ocelot_rmw_rix(ocelot, 0, QSYS_PORT_MODE_DEQUEUE_DIS, QSYS_PORT_MODE,
port);
/* Wait until flushing is complete. */
err = read_poll_timeout(ocelot_read_eq_avail, val, !val,
100, 2000000, false, ocelot, port);
/* Clear flushing again. */
ocelot_rmw_gix(ocelot, 0, REW_PORT_CFG_FLUSH_ENA, REW_PORT_CFG, port);
/* Re-enable flow control */
ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, pause_ena);
net: dsa: felix: implement port flushing on .phylink_mac_link_down There are several issues which may be seen when the link goes down while forwarding traffic, all of which can be attributed to the fact that the port flushing procedure from the reference manual was not closely followed. With flow control enabled on both the ingress port and the egress port, it may happen when a link goes down that Ethernet packets are in flight. In flow control mode, frames are held back and not dropped. When there is enough traffic in flight (example: iperf3 TCP), then the ingress port might enter congestion and never exit that state. This is a problem, because it is the egress port's link that went down, and that has caused the inability of the ingress port to send packets to any other port. This is solved by flushing the egress port's queues when it goes down. There is also a problem when performing stream splitting for IEEE 802.1CB traffic (not yet upstream, but a sort of multicast, basically). There, if one port from the destination ports mask goes down, splitting the stream towards the other destinations will no longer be performed. This can be traced down to this line: ocelot_port_writel(ocelot_port, 0, DEV_MAC_ENA_CFG); which should have been instead, as per the reference manual: ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA, DEV_MAC_ENA_CFG); Basically only DEV_MAC_ENA_CFG_RX_ENA should be disabled, but not DEV_MAC_ENA_CFG_TX_ENA - I don't have further insight into why that is the case, but apparently multicasting to several ports will cause issues if at least one of them doesn't have DEV_MAC_ENA_CFG_TX_ENA set. I am not sure what the state of the Ocelot VSC7514 driver is, but probably not as bad as Felix/Seville, since VSC7514 uses phylib and has the following in ocelot_adjust_link: if (!phydev->link) return; therefore the port is not really put down when the link is lost, unlike the DSA drivers which use .phylink_mac_link_down for that. Nonetheless, I put ocelot_port_flush() in the common ocelot.c because it needs to access some registers from drivers/net/ethernet/mscc/ocelot_rew.h which are not exported in include/soc/mscc/ and a bugfix patch should probably not move headers around. Fixes: bdeced75b13f ("net: dsa: felix: Add PCS operations for PHYLINK") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-08 17:36:27 +00:00
return err;
}
int ocelot_port_configure_serdes(struct ocelot *ocelot, int port,
struct device_node *portnp)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
struct device *dev = ocelot->dev;
int err;
/* Ensure clock signals and speed are set on all QSGMII links */
if (ocelot_port->phy_mode == PHY_INTERFACE_MODE_QSGMII)
ocelot_port_rmwl(ocelot_port, 0,
DEV_CLOCK_CFG_MAC_TX_RST |
DEV_CLOCK_CFG_MAC_RX_RST,
DEV_CLOCK_CFG);
if (ocelot_port->phy_mode != PHY_INTERFACE_MODE_INTERNAL) {
struct phy *serdes = of_phy_get(portnp, NULL);
if (IS_ERR(serdes)) {
err = PTR_ERR(serdes);
dev_err_probe(dev, err,
"missing SerDes phys for port %d\n",
port);
return err;
}
err = phy_set_mode_ext(serdes, PHY_MODE_ETHERNET,
ocelot_port->phy_mode);
of_phy_put(serdes);
if (err) {
dev_err(dev, "Could not SerDes mode on port %d: %pe\n",
port, ERR_PTR(err));
return err;
}
}
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_port_configure_serdes);
void ocelot_phylink_mac_config(struct ocelot *ocelot, int port,
unsigned int link_an_mode,
const struct phylink_link_state *state)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
/* Disable HDX fast control */
ocelot_port_writel(ocelot_port, DEV_PORT_MISC_HDX_FAST_DIS,
DEV_PORT_MISC);
/* SGMII only for now */
ocelot_port_writel(ocelot_port, PCS1G_MODE_CFG_SGMII_MODE_ENA,
PCS1G_MODE_CFG);
ocelot_port_writel(ocelot_port, PCS1G_SD_CFG_SD_SEL, PCS1G_SD_CFG);
/* Enable PCS */
ocelot_port_writel(ocelot_port, PCS1G_CFG_PCS_ENA, PCS1G_CFG);
/* No aneg on SGMII */
ocelot_port_writel(ocelot_port, 0, PCS1G_ANEG_CFG);
/* No loopback */
ocelot_port_writel(ocelot_port, 0, PCS1G_LB_CFG);
}
EXPORT_SYMBOL_GPL(ocelot_phylink_mac_config);
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
void ocelot_phylink_mac_link_down(struct ocelot *ocelot, int port,
unsigned int link_an_mode,
phy_interface_t interface,
unsigned long quirks)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
int err;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_port->speed = SPEED_UNKNOWN;
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
ocelot_port_rmwl(ocelot_port, 0, DEV_MAC_ENA_CFG_RX_ENA,
DEV_MAC_ENA_CFG);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
if (ocelot->ops->cut_through_fwd) {
mutex_lock(&ocelot->fwd_domain_lock);
ocelot->ops->cut_through_fwd(ocelot);
mutex_unlock(&ocelot->fwd_domain_lock);
}
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
ocelot_fields_write(ocelot, port, QSYS_SWITCH_PORT_MODE_PORT_ENA, 0);
err = ocelot_port_flush(ocelot, port);
if (err)
dev_err(ocelot->dev, "failed to flush port %d: %d\n",
port, err);
/* Put the port in reset. */
if (interface != PHY_INTERFACE_MODE_QSGMII ||
!(quirks & OCELOT_QUIRK_QSGMII_PORTS_MUST_BE_UP))
ocelot_port_rmwl(ocelot_port,
DEV_CLOCK_CFG_MAC_TX_RST |
DEV_CLOCK_CFG_MAC_RX_RST,
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
DEV_CLOCK_CFG_MAC_TX_RST |
DEV_CLOCK_CFG_MAC_RX_RST,
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
DEV_CLOCK_CFG);
}
EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_down);
void ocelot_phylink_mac_link_up(struct ocelot *ocelot, int port,
struct phy_device *phydev,
unsigned int link_an_mode,
phy_interface_t interface,
int speed, int duplex,
bool tx_pause, bool rx_pause,
unsigned long quirks)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
int mac_speed, mode = 0;
u32 mac_fc_cfg;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_port->speed = speed;
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
/* The MAC might be integrated in systems where the MAC speed is fixed
* and it's the PCS who is performing the rate adaptation, so we have
* to write "1000Mbps" into the LINK_SPEED field of DEV_CLOCK_CFG
* (which is also its default value).
*/
if ((quirks & OCELOT_QUIRK_PCS_PERFORMS_RATE_ADAPTATION) ||
speed == SPEED_1000) {
mac_speed = OCELOT_SPEED_1000;
mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
} else if (speed == SPEED_2500) {
mac_speed = OCELOT_SPEED_2500;
mode = DEV_MAC_MODE_CFG_GIGA_MODE_ENA;
} else if (speed == SPEED_100) {
mac_speed = OCELOT_SPEED_100;
} else {
mac_speed = OCELOT_SPEED_10;
}
if (duplex == DUPLEX_FULL)
mode |= DEV_MAC_MODE_CFG_FDX_ENA;
ocelot_port_writel(ocelot_port, mode, DEV_MAC_MODE_CFG);
/* Take port out of reset by clearing the MAC_TX_RST, MAC_RX_RST and
* PORT_RST bits in DEV_CLOCK_CFG.
*/
ocelot_port_writel(ocelot_port, DEV_CLOCK_CFG_LINK_SPEED(mac_speed),
DEV_CLOCK_CFG);
switch (speed) {
case SPEED_10:
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(OCELOT_SPEED_10);
break;
case SPEED_100:
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(OCELOT_SPEED_100);
break;
case SPEED_1000:
case SPEED_2500:
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
mac_fc_cfg = SYS_MAC_FC_CFG_FC_LINK_SPEED(OCELOT_SPEED_1000);
break;
default:
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
dev_err(ocelot->dev, "Unsupported speed on port %d: %d\n",
port, speed);
return;
}
if (rx_pause)
mac_fc_cfg |= SYS_MAC_FC_CFG_RX_FC_ENA;
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
if (tx_pause)
mac_fc_cfg |= SYS_MAC_FC_CFG_TX_FC_ENA |
SYS_MAC_FC_CFG_PAUSE_VAL_CFG(0xffff) |
SYS_MAC_FC_CFG_FC_LATENCY_CFG(0x7) |
SYS_MAC_FC_CFG_ZERO_PAUSE_ENA;
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
/* Flow control. Link speed is only used here to evaluate the time
* specification in incoming pause frames.
*/
ocelot_write_rix(ocelot, mac_fc_cfg, SYS_MAC_FC_CFG, port);
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
ocelot_write_rix(ocelot, 0, ANA_POL_FLOWC, port);
net: mscc: ocelot: don't let phylink re-enable TX PAUSE on the NPI port Since commit b39648079db4 ("net: mscc: ocelot: disable flow control on NPI interface"), flow control should be disabled on the DSA CPU port when used in NPI mode. However, the commit blamed in the Fixes: tag below broke this, because it allowed felix_phylink_mac_link_up() to overwrite SYS_PAUSE_CFG_PAUSE_ENA for the DSA CPU port. This issue became noticeable since the device tree update from commit 8fcea7be5736 ("arm64: dts: ls1028a: mark internal links between Felix and ENETC as capable of flow control"). The solution is to check whether this is the currently configured NPI port from ocelot_phylink_mac_link_up(), and to not modify the statically disabled PAUSE frame transmission if it is. When the port is configured for lossless mode as opposed to tail drop mode, but the link partner (DSA master) doesn't observe the transmitted PAUSE frames, the switch termination throughput is much worse, as can be seen below. Before: root@debian:~# iperf3 -c 192.168.100.2 Connecting to host 192.168.100.2, port 5201 [ 5] local 192.168.100.1 port 37504 connected to 192.168.100.2 port 5201 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 28.4 MBytes 238 Mbits/sec 357 22.6 KBytes [ 5] 1.00-2.00 sec 33.6 MBytes 282 Mbits/sec 426 19.8 KBytes [ 5] 2.00-3.00 sec 34.0 MBytes 285 Mbits/sec 343 21.2 KBytes [ 5] 3.00-4.00 sec 32.9 MBytes 276 Mbits/sec 354 22.6 KBytes [ 5] 4.00-5.00 sec 32.3 MBytes 271 Mbits/sec 297 18.4 KBytes ^C[ 5] 5.00-5.06 sec 2.05 MBytes 270 Mbits/sec 45 19.8 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-5.06 sec 163 MBytes 271 Mbits/sec 1822 sender [ 5] 0.00-5.06 sec 0.00 Bytes 0.00 bits/sec receiver After: root@debian:~# iperf3 -c 192.168.100.2 Connecting to host 192.168.100.2, port 5201 [ 5] local 192.168.100.1 port 49470 connected to 192.168.100.2 port 5201 [ ID] Interval Transfer Bitrate Retr Cwnd [ 5] 0.00-1.00 sec 112 MBytes 941 Mbits/sec 259 143 KBytes [ 5] 1.00-2.00 sec 110 MBytes 920 Mbits/sec 329 144 KBytes [ 5] 2.00-3.00 sec 112 MBytes 936 Mbits/sec 255 144 KBytes [ 5] 3.00-4.00 sec 110 MBytes 927 Mbits/sec 355 105 KBytes [ 5] 4.00-5.00 sec 110 MBytes 926 Mbits/sec 350 156 KBytes [ 5] 5.00-6.00 sec 110 MBytes 925 Mbits/sec 305 148 KBytes [ 5] 6.00-7.00 sec 110 MBytes 924 Mbits/sec 320 143 KBytes [ 5] 7.00-8.00 sec 110 MBytes 925 Mbits/sec 273 97.6 KBytes [ 5] 8.00-9.00 sec 109 MBytes 913 Mbits/sec 299 141 KBytes [ 5] 9.00-10.00 sec 110 MBytes 922 Mbits/sec 287 146 KBytes - - - - - - - - - - - - - - - - - - - - - - - - - [ ID] Interval Transfer Bitrate Retr [ 5] 0.00-10.00 sec 1.08 GBytes 926 Mbits/sec 3032 sender [ 5] 0.00-10.00 sec 1.08 GBytes 925 Mbits/sec receiver Fixes: de274be32cb2 ("net: dsa: felix: set TX flow control according to the phylink_mac_link_up resolution") Reported-by: Xiaoliang Yang <xiaoliang.yang_1@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-01-12 20:21:27 +00:00
/* Don't attempt to send PAUSE frames on the NPI port, it's broken */
if (port != ocelot->npi)
ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA,
tx_pause);
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
/* Undo the effects of ocelot_phylink_mac_link_down:
* enable MAC module
*/
ocelot_port_writel(ocelot_port, DEV_MAC_ENA_CFG_RX_ENA |
DEV_MAC_ENA_CFG_TX_ENA, DEV_MAC_ENA_CFG);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
/* If the port supports cut-through forwarding, update the masks before
* enabling forwarding on the port.
*/
if (ocelot->ops->cut_through_fwd) {
mutex_lock(&ocelot->fwd_domain_lock);
ocelot->ops->cut_through_fwd(ocelot);
mutex_unlock(&ocelot->fwd_domain_lock);
}
/* Core: Enable port for frame transfer */
ocelot_fields_write(ocelot, port,
QSYS_SWITCH_PORT_MODE_PORT_ENA, 1);
}
net: mscc: ocelot: convert to phylink The felix DSA driver, which is a wrapper over the same hardware class as ocelot, is integrated with phylink, but ocelot is using the plain PHY library. It makes sense to bring together the two implementations, which is what this patch achieves. This is a large patch and hard to break up, but it does the following: The existing ocelot_adjust_link writes some registers, and felix_phylink_mac_link_up writes some registers, some of them are common, but both functions write to some registers to which the other doesn't. The main reasons for this are: - Felix switches so far have used an NXP PCS so they had no need to write the PCS1G registers that ocelot_adjust_link writes - Felix switches have the MAC fixed at 1G, so some of the MAC speed changes actually break the link and must be avoided. The naming conventions for the functions introduced in this patch are: - vsc7514_phylink_{mac_config,validate} are specific to the Ocelot instantiations and placed in ocelot_net.c which is built only for the ocelot switchdev driver. - ocelot_phylink_mac_link_{up,down} are shared between the ocelot switchdev driver and the felix DSA driver (they are put in the common lib). One by one, the registers written by ocelot_adjust_link are: DEV_MAC_MODE_CFG - felix_phylink_mac_link_up had no need to write this register since its out-of-reset value was fine and did not need changing. The write is moved to the common ocelot_phylink_mac_link_up and on felix it is guarded by a quirk bit that makes the written value identical with the out-of-reset one DEV_PORT_MISC - runtime invariant, was moved to vsc7514_phylink_mac_config PCS1G_MODE_CFG - same as above PCS1G_SD_CFG - same as above PCS1G_CFG - same as above PCS1G_ANEG_CFG - same as above PCS1G_LB_CFG - same as above DEV_MAC_ENA_CFG - both ocelot_adjust_link and ocelot_port_disable touched this. felix_phylink_mac_link_{up,down} also do. We go with what felix does and put it in ocelot_phylink_mac_link_up. DEV_CLOCK_CFG - ocelot_adjust_link and felix_phylink_mac_link_up both write this, but to different values. Move to the common ocelot_phylink_mac_link_up and make sure via the quirk that the old values are preserved for both. ANA_PFC_PFC_CFG - ocelot_adjust_link wrote this, felix_phylink_mac_link_up did not. Runtime invariant, speed does not matter since PFC is disabled via the RX_PFC_ENA bits which are cleared. Move to vsc7514_phylink_mac_config. QSYS_SWITCH_PORT_MODE_PORT_ENA - both ocelot_adjust_link and felix_phylink_mac_link_{up,down} wrote this. Ocelot also wrote this register from ocelot_port_disable. Keep what felix did, move in ocelot_phylink_mac_link_{up,down} and delete ocelot_port_disable. ANA_POL_FLOWC - same as above SYS_MAC_FC_CFG - same as above, except slight behavior change. Whereas ocelot always enabled RX and TX flow control, felix listened to phylink (for the most part, at least - see the 2500base-X comment). The registers which only felix_phylink_mac_link_up wrote are: SYS_PAUSE_CFG_PAUSE_ENA - this is why I am not sure that flow control worked on ocelot. Not it should, since the code is shared with felix where it does. ANA_PORT_PORT_CFG - this is a Frame Analyzer block register, phylink should be the one touching them, deleted. Other changes: - The old phylib registration code was in mscc_ocelot_init_ports. It is hard to work with 2 levels of indentation already in, and with hard to follow teardown logic. The new phylink registration code was moved inside ocelot_probe_port(), right between alloc_etherdev() and register_netdev(). It could not be done before (=> outside of) ocelot_probe_port() because ocelot_probe_port() allocates the struct ocelot_port which we then use to assign ocelot_port->phy_mode to. It is more preferable to me to have all PHY handling logic inside the same function. - On the same topic: struct ocelot_port_private :: serdes is only used in ocelot_port_open to set the SERDES protocol to Ethernet. This is logically a runtime invariant and can be done just once, when the port registers with phylink. We therefore don't even need to keep the serdes reference inside struct ocelot_port_private, or to use the devm variant of of_phy_get(). - Phylink needs a valid phy-mode for phylink_create() to succeed, and the existing device tree bindings in arch/mips/boot/dts/mscc/ocelot_pcb120.dts don't define one for the internal PHY ports. So we patch PHY_INTERFACE_MODE_NA into PHY_INTERFACE_MODE_INTERNAL. - There was a strategically placed: switch (priv->phy_mode) { case PHY_INTERFACE_MODE_NA: continue; which made the code skip the serdes initialization for the internal PHY ports. Frankly that is not all that obvious, so now we explicitly initialize the serdes under an "if" condition and not rely on code jumps, so everything is clearer. - There was a write of OCELOT_SPEED_1000 to DEV_CLOCK_CFG for QSGMII ports. Since that is in fact the default value for the register field DEV_CLOCK_CFG_LINK_SPEED, I can only guess the intention was to clear the adjacent fields, MAC_TX_RST and MAC_RX_RST, aka take the port out of reset, which does match the comment. I don't even want to know why this code is placed there, but if there is indeed an issue that all ports that share a QSGMII lane must all be up, then this logic is already buggy, since mscc_ocelot_init_ports iterates using for_each_available_child_of_node, so nobody prevents the user from putting a 'status = "disabled";' for some QSGMII ports which would break the driver's assumption. In any case, in the eventuality that I'm right, we would have yet another issue if ocelot_phylink_mac_link_down would reset those ports and that would be forbidden, so since the ocelot_adjust_link logic did not do that (maybe for a reason), add another quirk to preserve the old logic. The ocelot driver teardown goes through all ports in one fell swoop. When initialization of one port fails, the ocelot->ports[port] pointer for that is reset to NULL, and teardown is done only for non-NULL ports, so there is no reason to do partial teardowns, let the central mscc_ocelot_release_ports() do its job. Tested bind, unbind, rebind, link up, link down, speed change on mock-up hardware (modified the driver to probe on Felix VSC9959). Also regression tested the felix DSA driver. Could not test the Ocelot specific bits (PCS1G, SERDES, device tree bindings). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-08-15 01:47:48 +00:00
EXPORT_SYMBOL_GPL(ocelot_phylink_mac_link_up);
static int ocelot_rx_frame_word(struct ocelot *ocelot, u8 grp, bool ifh,
u32 *rval)
{
u32 bytes_valid, val;
val = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
if (val == XTR_NOT_READY) {
if (ifh)
return -EIO;
do {
val = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
} while (val == XTR_NOT_READY);
}
switch (val) {
case XTR_ABORT:
return -EIO;
case XTR_EOF_0:
case XTR_EOF_1:
case XTR_EOF_2:
case XTR_EOF_3:
case XTR_PRUNED:
bytes_valid = XTR_VALID_BYTES(val);
val = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
if (val == XTR_ESCAPE)
*rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
else
*rval = val;
return bytes_valid;
case XTR_ESCAPE:
*rval = ocelot_read_rix(ocelot, QS_XTR_RD, grp);
return 4;
default:
*rval = val;
return 4;
}
}
static int ocelot_xtr_poll_xfh(struct ocelot *ocelot, int grp, u32 *xfh)
{
int i, err = 0;
for (i = 0; i < OCELOT_TAG_LEN / 4; i++) {
err = ocelot_rx_frame_word(ocelot, grp, true, &xfh[i]);
if (err != 4)
return (err < 0) ? err : -EIO;
}
return 0;
}
void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
u64 timestamp)
{
struct skb_shared_hwtstamps *shhwtstamps;
u64 tod_in_ns, full_ts_in_ns;
struct timespec64 ts;
ocelot_ptp_gettime64(&ocelot->ptp_info, &ts);
tod_in_ns = ktime_set(ts.tv_sec, ts.tv_nsec);
if ((tod_in_ns & 0xffffffff) < timestamp)
full_ts_in_ns = (((tod_in_ns >> 32) - 1) << 32) |
timestamp;
else
full_ts_in_ns = (tod_in_ns & GENMASK_ULL(63, 32)) |
timestamp;
shhwtstamps = skb_hwtstamps(skb);
memset(shhwtstamps, 0, sizeof(struct skb_shared_hwtstamps));
shhwtstamps->hwtstamp = full_ts_in_ns;
}
EXPORT_SYMBOL(ocelot_ptp_rx_timestamp);
int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
{
u64 timestamp, src_port, len;
u32 xfh[OCELOT_TAG_LEN / 4];
struct net_device *dev;
struct sk_buff *skb;
int sz, buf_len;
u32 val, *buf;
int err;
err = ocelot_xtr_poll_xfh(ocelot, grp, xfh);
if (err)
return err;
ocelot_xfh_get_src_port(xfh, &src_port);
ocelot_xfh_get_len(xfh, &len);
ocelot_xfh_get_rew_val(xfh, &timestamp);
if (WARN_ON(src_port >= ocelot->num_phys_ports))
return -EINVAL;
dev = ocelot->ops->port_to_netdev(ocelot, src_port);
if (!dev)
return -EINVAL;
skb = netdev_alloc_skb(dev, len);
if (unlikely(!skb)) {
netdev_err(dev, "Unable to allocate sk_buff\n");
return -ENOMEM;
}
buf_len = len - ETH_FCS_LEN;
buf = (u32 *)skb_put(skb, buf_len);
len = 0;
do {
sz = ocelot_rx_frame_word(ocelot, grp, false, &val);
if (sz < 0) {
err = sz;
goto out_free_skb;
}
*buf++ = val;
len += sz;
} while (len < buf_len);
/* Read the FCS */
sz = ocelot_rx_frame_word(ocelot, grp, false, &val);
if (sz < 0) {
err = sz;
goto out_free_skb;
}
/* Update the statistics if part of the FCS was read before */
len -= ETH_FCS_LEN - sz;
if (unlikely(dev->features & NETIF_F_RXFCS)) {
buf = (u32 *)skb_put(skb, ETH_FCS_LEN);
*buf = val;
}
if (ocelot->ptp)
ocelot_ptp_rx_timestamp(ocelot, skb, timestamp);
/* Everything we see on an interface that is in the HW bridge
* has already been forwarded.
*/
if (ocelot->ports[src_port]->bridge)
skb->offload_fwd_mark = 1;
skb->protocol = eth_type_trans(skb, dev);
*nskb = skb;
return 0;
out_free_skb:
kfree_skb(skb);
return err;
}
EXPORT_SYMBOL(ocelot_xtr_poll_frame);
bool ocelot_can_inject(struct ocelot *ocelot, int grp)
{
u32 val = ocelot_read(ocelot, QS_INJ_STATUS);
if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp))))
return false;
if (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp)))
return false;
return true;
}
EXPORT_SYMBOL(ocelot_can_inject);
void ocelot_ifh_port_set(void *ifh, int port, u32 rew_op, u32 vlan_tag)
{
ocelot_ifh_set_bypass(ifh, 1);
ocelot_ifh_set_dest(ifh, BIT_ULL(port));
ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
if (vlan_tag)
ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
if (rew_op)
ocelot_ifh_set_rew_op(ifh, rew_op);
}
EXPORT_SYMBOL(ocelot_ifh_port_set);
void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
u32 rew_op, struct sk_buff *skb)
{
u32 ifh[OCELOT_TAG_LEN / 4] = {0};
unsigned int i, count, last;
ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb));
for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
count = DIV_ROUND_UP(skb->len, 4);
last = skb->len % 4;
for (i = 0; i < count; i++)
ocelot_write_rix(ocelot, ((u32 *)skb->data)[i], QS_INJ_WR, grp);
/* Add padding */
while (i < (OCELOT_BUFFER_CELL_SZ / 4)) {
ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp);
i++;
}
/* Indicate EOF and valid bytes in last word */
ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
QS_INJ_CTRL_VLD_BYTES(skb->len < OCELOT_BUFFER_CELL_SZ ? 0 : last) |
QS_INJ_CTRL_EOF,
QS_INJ_CTRL, grp);
/* Add dummy CRC */
ocelot_write_rix(ocelot, 0, QS_INJ_WR, grp);
skb_tx_timestamp(skb);
skb->dev->stats.tx_packets++;
skb->dev->stats.tx_bytes += skb->len;
}
EXPORT_SYMBOL(ocelot_port_inject_frame);
net: dsa: tag_ocelot_8021q: add support for PTP timestamping For TX timestamping, we use the felix_txtstamp method which is common with the regular (non-8021q) ocelot tagger. This method says that skb deferral is needed, prepares a timestamp request ID, and puts a clone of the skb in a queue waiting for the timestamp IRQ. felix_txtstamp is called by dsa_skb_tx_timestamp() just before the tagger's xmit method. In the tagger xmit, we divert the packets classified by dsa_skb_tx_timestamp() as PTP towards the MMIO-based injection registers, and we declare them as dead towards dsa_slave_xmit. If not PTP, we proceed with normal tag_8021q stuff. Then the timestamp IRQ fires, the clone queued up from felix_txtstamp is matched to the TX timestamp retrieved from the switch's FIFO based on the timestamp request ID, and the clone is delivered to the stack. On RX, thanks to the VCAP IS2 rule that redirects the frames with an EtherType for 1588 towards two destinations: - the CPU port module (for MMIO based extraction) and - if the "no XTR IRQ" workaround is in place, the dsa_8021q CPU port the relevant data path processing starts in the ptp_classify_raw BPF classifier installed by DSA in the RX data path (post tagger, which is completely unaware that it saw a PTP packet). This time we can't reuse the same implementation of .port_rxtstamp that also works with the default ocelot tagger. That is because felix_rxtstamp is given an skb with a freshly stripped DSA header, and it says "I don't need deferral for its RX timestamp, it's right in it, let me show you"; and it just points to the header right behind skb->data, from where it unpacks the timestamp and annotates the skb with it. The same thing cannot happen with tag_ocelot_8021q, because for one thing, the skb did not have an extraction frame header in the first place, but a VLAN tag with no timestamp information. So the code paths in felix_rxtstamp for the regular and 8021q tagger are completely independent. With tag_8021q, the timestamp must come from the packet's duplicate delivered to the CPU port module, but there is potentially complex logic to be handled [ and prone to reordering ] if we were to just start reading packets from the CPU port module, and try to match them to the one we received over Ethernet and which needs an RX timestamp. So we do something simple: we tell DSA "give me some time to think" (we request skb deferral by returning false from .port_rxtstamp) and we just drop the frame we got over Ethernet with no attempt to match it to anything - we just treat it as a notification that there's data to be processed from the CPU port module's queues. Then we proceed to read the packets from those, one by one, which we deliver up the stack, timestamped, using netif_rx - the same function that any driver would use anyway if it needed RX timestamp deferral. So the assumption is that we'll come across the PTP packet that triggered the CPU extraction notification eventually, but we don't know when exactly. Thanks to the VCAP IS2 trap/redirect rule and the exclusion of the CPU port module from the flooding replicators, only PTP frames should be present in the CPU port module's RX queues anyway. There is just one conflict between the VCAP IS2 trapping rule and the semantics of the BPF classifier. Namely, ptp_classify_raw() deems general messages as non-timestampable, but still, those are trapped to the CPU port module since they have an EtherType of ETH_P_1588. So, if the "no XTR IRQ" workaround is in place, we need to run another BPF classifier on the frames extracted over MMIO, to avoid duplicates being sent to the stack (once over Ethernet, once over MMIO). It doesn't look like it's possible to install VCAP IS2 rules based on keys extracted from the 1588 frame headers. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-02-13 22:38:01 +00:00
void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp)
{
while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp))
ocelot_read_rix(ocelot, QS_XTR_RD, grp);
}
EXPORT_SYMBOL(ocelot_drain_cpu_queue);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
int ocelot_fdb_add(struct ocelot *ocelot, int port, const unsigned char *addr,
u16 vid, const struct net_device *bridge)
{
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (!vid)
vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
return ocelot_mact_learn(ocelot, port, addr, vid, ENTRYTYPE_LOCKED);
}
EXPORT_SYMBOL(ocelot_fdb_add);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
int ocelot_fdb_del(struct ocelot *ocelot, int port, const unsigned char *addr,
u16 vid, const struct net_device *bridge)
{
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (!vid)
vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
return ocelot_mact_forget(ocelot, addr, vid);
}
EXPORT_SYMBOL(ocelot_fdb_del);
/* Caller must hold &ocelot->mact_lock */
static int ocelot_mact_read(struct ocelot *ocelot, int port, int row, int col,
struct ocelot_mact_entry *entry)
{
u32 val, dst, macl, mach;
char mac[ETH_ALEN];
/* Set row and column to read from */
ocelot_field_write(ocelot, ANA_TABLES_MACTINDX_M_INDEX, row);
ocelot_field_write(ocelot, ANA_TABLES_MACTINDX_BUCKET, col);
/* Issue a read command */
ocelot_write(ocelot,
ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_READ),
ANA_TABLES_MACACCESS);
if (ocelot_mact_wait_for_completion(ocelot))
return -ETIMEDOUT;
/* Read the entry flags */
val = ocelot_read(ocelot, ANA_TABLES_MACACCESS);
if (!(val & ANA_TABLES_MACACCESS_VALID))
return -EINVAL;
/* If the entry read has another port configured as its destination,
* do not report it.
*/
dst = (val & ANA_TABLES_MACACCESS_DEST_IDX_M) >> 3;
if (dst != port)
return -EINVAL;
/* Get the entry's MAC address and VLAN id */
macl = ocelot_read(ocelot, ANA_TABLES_MACLDATA);
mach = ocelot_read(ocelot, ANA_TABLES_MACHDATA);
mac[0] = (mach >> 8) & 0xff;
mac[1] = (mach >> 0) & 0xff;
mac[2] = (macl >> 24) & 0xff;
mac[3] = (macl >> 16) & 0xff;
mac[4] = (macl >> 8) & 0xff;
mac[5] = (macl >> 0) & 0xff;
entry->vid = (mach >> 16) & 0xfff;
ether_addr_copy(entry->mac, mac);
return 0;
}
int ocelot_mact_flush(struct ocelot *ocelot, int port)
{
int err;
mutex_lock(&ocelot->mact_lock);
/* Program ageing filter for a single port */
ocelot_write(ocelot, ANA_ANAGEFIL_PID_EN | ANA_ANAGEFIL_PID_VAL(port),
ANA_ANAGEFIL);
/* Flushing dynamic FDB entries requires two successive age scans */
ocelot_write(ocelot,
ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_AGE),
ANA_TABLES_MACACCESS);
err = ocelot_mact_wait_for_completion(ocelot);
if (err) {
mutex_unlock(&ocelot->mact_lock);
return err;
}
/* And second... */
ocelot_write(ocelot,
ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_AGE),
ANA_TABLES_MACACCESS);
err = ocelot_mact_wait_for_completion(ocelot);
/* Restore ageing filter */
ocelot_write(ocelot, 0, ANA_ANAGEFIL);
mutex_unlock(&ocelot->mact_lock);
return err;
}
EXPORT_SYMBOL_GPL(ocelot_mact_flush);
int ocelot_fdb_dump(struct ocelot *ocelot, int port,
dsa_fdb_dump_cb_t *cb, void *data)
{
int err = 0;
int i, j;
/* We could take the lock just around ocelot_mact_read, but doing so
* thousands of times in a row seems rather pointless and inefficient.
*/
mutex_lock(&ocelot->mact_lock);
/* Loop through all the mac tables entries. */
for (i = 0; i < ocelot->num_mact_rows; i++) {
for (j = 0; j < 4; j++) {
struct ocelot_mact_entry entry;
bool is_static;
err = ocelot_mact_read(ocelot, port, i, j, &entry);
/* If the entry is invalid (wrong port, invalid...),
* skip it.
*/
if (err == -EINVAL)
continue;
else if (err)
break;
is_static = (entry.type == ENTRYTYPE_LOCKED);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
/* Hide the reserved VLANs used for
* VLAN-unaware bridging.
*/
if (entry.vid > OCELOT_RSV_VLAN_RANGE_START)
entry.vid = 0;
err = cb(entry.mac, entry.vid, is_static, data);
if (err)
break;
}
}
mutex_unlock(&ocelot->mact_lock);
return err;
}
EXPORT_SYMBOL(ocelot_fdb_dump);
int ocelot_trap_add(struct ocelot *ocelot, int port,
unsigned long cookie, bool take_ts,
void (*populate)(struct ocelot_vcap_filter *f))
net: mscc: ocelot: set up traps for PTP packets IEEE 1588 support was declared too soon for the Ocelot switch. Out of reset, this switch does not apply any special treatment for PTP packets, i.e. when an event message is received, the natural tendency is to forward it by MAC DA/VLAN ID. This poses a problem when the ingress port is under a bridge, since user space application stacks (written primarily for endpoint ports, not switches) like ptp4l expect that PTP messages are always received on AF_PACKET / AF_INET sockets (depending on the PTP transport being used), and never being autonomously forwarded. Any forwarding, if necessary (for example in Transparent Clock mode) is handled in software by ptp4l. Having the hardware forward these packets too will cause duplicates which will confuse endpoints connected to these switches. So PTP over L2 barely works, in the sense that PTP packets reach the CPU port, but they reach it via flooding, and therefore reach lots of other unwanted destinations too. But PTP over IPv4/IPv6 does not work at all. This is because the Ocelot switch have a separate destination port mask for unknown IP multicast (which PTP over IP is) flooding compared to unknown non-IP multicast (which PTP over L2 is) flooding. Specifically, the driver allows the CPU port to be in the PGID_MC port group, but not in PGID_MCIPV4 and PGID_MCIPV6. There are several presentations from Allan Nielsen which explain that the embedded MIPS CPU on Ocelot switches is not very powerful at all, so every penny they could save by not allowing flooding to the CPU port module matters. Unknown IP multicast did not make it. The de facto consensus is that when a switch is PTP-aware and an application stack for PTP is running, switches should have some sort of trapping mechanism for PTP packets, to extract them from the hardware data path. This avoids both problems: (a) PTP packets are no longer flooded to unwanted destinations (b) PTP over IP packets are no longer denied from reaching the CPU since they arrive there via a trap and not via flooding It is not the first time when this change is attempted. Last time, the feedback from Allan Nielsen and Andrew Lunn was that the traps should not be installed by default, and that PTP-unaware switching may be desired for some use cases: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ To address that feedback, the present patch adds the necessary packet traps according to the RX filter configuration transmitted by user space through the SIOCSHWTSTAMP ioctl. Trapping is done via VCAP IS2, where we keep 5 filters, which are amended each time RX timestamping is enabled or disabled on a port: - 1 for PTP over L2 - 2 for PTP over IPv4 (UDP ports 319 and 320) - 2 for PTP over IPv6 (UDP ports 319 and 320) The cookie by which these filters (invisible to tc) are identified is strategically chosen such that it does not collide with the filters used for the ocelot-8021q tagging protocol by the Felix driver, or with the MRP traps set up by the Ocelot library. Other alternatives were considered, like patching user space to do something, but there are so many ways in which PTP packets could be made to reach the CPU, generically speaking, that "do what?" is a very valid question. The ptp4l program from the linuxptp stack already attempts to do something: it calls setsockopt(IP_ADD_MEMBERSHIP) (and PACKET_ADD_MEMBERSHIP, respectively) which translates in both cases into a dev_mc_add() on the interface, in the kernel: https://github.com/richardcochran/linuxptp/blob/v3.1.1/udp.c#L73 https://github.com/richardcochran/linuxptp/blob/v3.1.1/raw.c Reality shows that this is not sufficient in case the interface belongs to a switchdev driver, as dev_mc_add() does not show the intention to trap a packet to the CPU, but rather the intention to not drop it (it is strictly for RX filtering, same as promiscuous does not mean to send all traffic to the CPU, but to not drop traffic with unknown MAC DA). This topic is a can of worms in itself, and it would be great if user space could just stay out of it. On the other hand, setting up PTP traps privately within the driver is not new by any stretch of the imagination: https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c#L833 https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/dsa/hirschmann/hellcreek.c#L1050 https://elixir.bootlin.com/linux/v5.16-rc2/source/include/linux/dsa/sja1105.h#L21 So this is the approach taken here as well. The difference here being that we prepare and destroy the traps per port, dynamically at runtime, as opposed to driver init time, because apparently, PTP-unaware forwarding is a use case. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Reported-by: Po Liu <po.liu@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-26 17:28:44 +00:00
{
struct ocelot_vcap_block *block_vcap_is2;
struct ocelot_vcap_filter *trap;
bool new = false;
int err;
block_vcap_is2 = &ocelot->block[VCAP_IS2];
trap = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, cookie,
false);
if (!trap) {
trap = kzalloc(sizeof(*trap), GFP_KERNEL);
if (!trap)
return -ENOMEM;
populate(trap);
trap->prio = 1;
trap->id.cookie = cookie;
trap->id.tc_offload = false;
trap->block_id = VCAP_IS2;
trap->type = OCELOT_VCAP_FILTER_OFFLOAD;
trap->lookup = 0;
trap->action.cpu_copy_ena = true;
trap->action.mask_mode = OCELOT_MASK_MODE_PERMIT_DENY;
trap->action.port_mask = 0;
trap->take_ts = take_ts;
net: mscc: ocelot: mark traps with a bool instead of keeping them in a list Since the blamed commit, VCAP filters can appear on more than one list. If their action is "trap", they are chained on ocelot->traps via filter->trap_list. This is in addition to their normal placement on the VCAP block->rules list head. Therefore, when we free a VCAP filter, we must remove it from all lists it is a member of, including ocelot->traps. There are at least 2 bugs which are direct consequences of this design decision. First is the incorrect usage of list_empty(), meant to denote whether "filter" is chained into ocelot->traps via filter->trap_list. This does not do the correct thing, because list_empty() checks whether "head->next == head", but in our case, head->next == head->prev == NULL. So we dereference NULL pointers and die when we call list_del(). Second is the fact that not all places that should remove the filter from ocelot->traps do so. One example is ocelot_vcap_block_remove_filter(), which is where we have the main kfree(filter). By keeping freed filters in ocelot->traps we end up in a use-after-free in felix_update_trapping_destinations(). Attempting to fix all the buggy patterns is a whack-a-mole game which makes the driver unmaintainable. Actually this is what the previous patch version attempted to do: https://patchwork.kernel.org/project/netdevbpf/patch/20220503115728.834457-3-vladimir.oltean@nxp.com/ but it introduced another set of bugs, because there are other places in which create VCAP filters, not just ocelot_vcap_filter_create(): - ocelot_trap_add() - felix_tag_8021q_vlan_add_rx() - felix_tag_8021q_vlan_add_tx() Relying on the convention that all those code paths must call INIT_LIST_HEAD(&filter->trap_list) is not going to scale. So let's do what should have been done in the first place and keep a bool in struct ocelot_vcap_filter which denotes whether we are looking at a trapping rule or not. Iterating now happens over the main VCAP IS2 block->rules. The advantage is that we no longer risk having stale references to a freed filter, since it is only present in that list. Fixes: e42bd4ed09aa ("net: mscc: ocelot: keep traps in a list") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-05-04 23:54:59 +00:00
trap->is_trap = true;
net: mscc: ocelot: set up traps for PTP packets IEEE 1588 support was declared too soon for the Ocelot switch. Out of reset, this switch does not apply any special treatment for PTP packets, i.e. when an event message is received, the natural tendency is to forward it by MAC DA/VLAN ID. This poses a problem when the ingress port is under a bridge, since user space application stacks (written primarily for endpoint ports, not switches) like ptp4l expect that PTP messages are always received on AF_PACKET / AF_INET sockets (depending on the PTP transport being used), and never being autonomously forwarded. Any forwarding, if necessary (for example in Transparent Clock mode) is handled in software by ptp4l. Having the hardware forward these packets too will cause duplicates which will confuse endpoints connected to these switches. So PTP over L2 barely works, in the sense that PTP packets reach the CPU port, but they reach it via flooding, and therefore reach lots of other unwanted destinations too. But PTP over IPv4/IPv6 does not work at all. This is because the Ocelot switch have a separate destination port mask for unknown IP multicast (which PTP over IP is) flooding compared to unknown non-IP multicast (which PTP over L2 is) flooding. Specifically, the driver allows the CPU port to be in the PGID_MC port group, but not in PGID_MCIPV4 and PGID_MCIPV6. There are several presentations from Allan Nielsen which explain that the embedded MIPS CPU on Ocelot switches is not very powerful at all, so every penny they could save by not allowing flooding to the CPU port module matters. Unknown IP multicast did not make it. The de facto consensus is that when a switch is PTP-aware and an application stack for PTP is running, switches should have some sort of trapping mechanism for PTP packets, to extract them from the hardware data path. This avoids both problems: (a) PTP packets are no longer flooded to unwanted destinations (b) PTP over IP packets are no longer denied from reaching the CPU since they arrive there via a trap and not via flooding It is not the first time when this change is attempted. Last time, the feedback from Allan Nielsen and Andrew Lunn was that the traps should not be installed by default, and that PTP-unaware switching may be desired for some use cases: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ To address that feedback, the present patch adds the necessary packet traps according to the RX filter configuration transmitted by user space through the SIOCSHWTSTAMP ioctl. Trapping is done via VCAP IS2, where we keep 5 filters, which are amended each time RX timestamping is enabled or disabled on a port: - 1 for PTP over L2 - 2 for PTP over IPv4 (UDP ports 319 and 320) - 2 for PTP over IPv6 (UDP ports 319 and 320) The cookie by which these filters (invisible to tc) are identified is strategically chosen such that it does not collide with the filters used for the ocelot-8021q tagging protocol by the Felix driver, or with the MRP traps set up by the Ocelot library. Other alternatives were considered, like patching user space to do something, but there are so many ways in which PTP packets could be made to reach the CPU, generically speaking, that "do what?" is a very valid question. The ptp4l program from the linuxptp stack already attempts to do something: it calls setsockopt(IP_ADD_MEMBERSHIP) (and PACKET_ADD_MEMBERSHIP, respectively) which translates in both cases into a dev_mc_add() on the interface, in the kernel: https://github.com/richardcochran/linuxptp/blob/v3.1.1/udp.c#L73 https://github.com/richardcochran/linuxptp/blob/v3.1.1/raw.c Reality shows that this is not sufficient in case the interface belongs to a switchdev driver, as dev_mc_add() does not show the intention to trap a packet to the CPU, but rather the intention to not drop it (it is strictly for RX filtering, same as promiscuous does not mean to send all traffic to the CPU, but to not drop traffic with unknown MAC DA). This topic is a can of worms in itself, and it would be great if user space could just stay out of it. On the other hand, setting up PTP traps privately within the driver is not new by any stretch of the imagination: https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c#L833 https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/dsa/hirschmann/hellcreek.c#L1050 https://elixir.bootlin.com/linux/v5.16-rc2/source/include/linux/dsa/sja1105.h#L21 So this is the approach taken here as well. The difference here being that we prepare and destroy the traps per port, dynamically at runtime, as opposed to driver init time, because apparently, PTP-unaware forwarding is a use case. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Reported-by: Po Liu <po.liu@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-26 17:28:44 +00:00
new = true;
}
trap->ingress_port_mask |= BIT(port);
if (new)
err = ocelot_vcap_filter_add(ocelot, trap, NULL);
else
err = ocelot_vcap_filter_replace(ocelot, trap);
if (err) {
trap->ingress_port_mask &= ~BIT(port);
net: mscc: ocelot: mark traps with a bool instead of keeping them in a list Since the blamed commit, VCAP filters can appear on more than one list. If their action is "trap", they are chained on ocelot->traps via filter->trap_list. This is in addition to their normal placement on the VCAP block->rules list head. Therefore, when we free a VCAP filter, we must remove it from all lists it is a member of, including ocelot->traps. There are at least 2 bugs which are direct consequences of this design decision. First is the incorrect usage of list_empty(), meant to denote whether "filter" is chained into ocelot->traps via filter->trap_list. This does not do the correct thing, because list_empty() checks whether "head->next == head", but in our case, head->next == head->prev == NULL. So we dereference NULL pointers and die when we call list_del(). Second is the fact that not all places that should remove the filter from ocelot->traps do so. One example is ocelot_vcap_block_remove_filter(), which is where we have the main kfree(filter). By keeping freed filters in ocelot->traps we end up in a use-after-free in felix_update_trapping_destinations(). Attempting to fix all the buggy patterns is a whack-a-mole game which makes the driver unmaintainable. Actually this is what the previous patch version attempted to do: https://patchwork.kernel.org/project/netdevbpf/patch/20220503115728.834457-3-vladimir.oltean@nxp.com/ but it introduced another set of bugs, because there are other places in which create VCAP filters, not just ocelot_vcap_filter_create(): - ocelot_trap_add() - felix_tag_8021q_vlan_add_rx() - felix_tag_8021q_vlan_add_tx() Relying on the convention that all those code paths must call INIT_LIST_HEAD(&filter->trap_list) is not going to scale. So let's do what should have been done in the first place and keep a bool in struct ocelot_vcap_filter which denotes whether we are looking at a trapping rule or not. Iterating now happens over the main VCAP IS2 block->rules. The advantage is that we no longer risk having stale references to a freed filter, since it is only present in that list. Fixes: e42bd4ed09aa ("net: mscc: ocelot: keep traps in a list") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-05-04 23:54:59 +00:00
if (!trap->ingress_port_mask)
net: mscc: ocelot: set up traps for PTP packets IEEE 1588 support was declared too soon for the Ocelot switch. Out of reset, this switch does not apply any special treatment for PTP packets, i.e. when an event message is received, the natural tendency is to forward it by MAC DA/VLAN ID. This poses a problem when the ingress port is under a bridge, since user space application stacks (written primarily for endpoint ports, not switches) like ptp4l expect that PTP messages are always received on AF_PACKET / AF_INET sockets (depending on the PTP transport being used), and never being autonomously forwarded. Any forwarding, if necessary (for example in Transparent Clock mode) is handled in software by ptp4l. Having the hardware forward these packets too will cause duplicates which will confuse endpoints connected to these switches. So PTP over L2 barely works, in the sense that PTP packets reach the CPU port, but they reach it via flooding, and therefore reach lots of other unwanted destinations too. But PTP over IPv4/IPv6 does not work at all. This is because the Ocelot switch have a separate destination port mask for unknown IP multicast (which PTP over IP is) flooding compared to unknown non-IP multicast (which PTP over L2 is) flooding. Specifically, the driver allows the CPU port to be in the PGID_MC port group, but not in PGID_MCIPV4 and PGID_MCIPV6. There are several presentations from Allan Nielsen which explain that the embedded MIPS CPU on Ocelot switches is not very powerful at all, so every penny they could save by not allowing flooding to the CPU port module matters. Unknown IP multicast did not make it. The de facto consensus is that when a switch is PTP-aware and an application stack for PTP is running, switches should have some sort of trapping mechanism for PTP packets, to extract them from the hardware data path. This avoids both problems: (a) PTP packets are no longer flooded to unwanted destinations (b) PTP over IP packets are no longer denied from reaching the CPU since they arrive there via a trap and not via flooding It is not the first time when this change is attempted. Last time, the feedback from Allan Nielsen and Andrew Lunn was that the traps should not be installed by default, and that PTP-unaware switching may be desired for some use cases: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ To address that feedback, the present patch adds the necessary packet traps according to the RX filter configuration transmitted by user space through the SIOCSHWTSTAMP ioctl. Trapping is done via VCAP IS2, where we keep 5 filters, which are amended each time RX timestamping is enabled or disabled on a port: - 1 for PTP over L2 - 2 for PTP over IPv4 (UDP ports 319 and 320) - 2 for PTP over IPv6 (UDP ports 319 and 320) The cookie by which these filters (invisible to tc) are identified is strategically chosen such that it does not collide with the filters used for the ocelot-8021q tagging protocol by the Felix driver, or with the MRP traps set up by the Ocelot library. Other alternatives were considered, like patching user space to do something, but there are so many ways in which PTP packets could be made to reach the CPU, generically speaking, that "do what?" is a very valid question. The ptp4l program from the linuxptp stack already attempts to do something: it calls setsockopt(IP_ADD_MEMBERSHIP) (and PACKET_ADD_MEMBERSHIP, respectively) which translates in both cases into a dev_mc_add() on the interface, in the kernel: https://github.com/richardcochran/linuxptp/blob/v3.1.1/udp.c#L73 https://github.com/richardcochran/linuxptp/blob/v3.1.1/raw.c Reality shows that this is not sufficient in case the interface belongs to a switchdev driver, as dev_mc_add() does not show the intention to trap a packet to the CPU, but rather the intention to not drop it (it is strictly for RX filtering, same as promiscuous does not mean to send all traffic to the CPU, but to not drop traffic with unknown MAC DA). This topic is a can of worms in itself, and it would be great if user space could just stay out of it. On the other hand, setting up PTP traps privately within the driver is not new by any stretch of the imagination: https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c#L833 https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/dsa/hirschmann/hellcreek.c#L1050 https://elixir.bootlin.com/linux/v5.16-rc2/source/include/linux/dsa/sja1105.h#L21 So this is the approach taken here as well. The difference here being that we prepare and destroy the traps per port, dynamically at runtime, as opposed to driver init time, because apparently, PTP-unaware forwarding is a use case. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Reported-by: Po Liu <po.liu@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-26 17:28:44 +00:00
kfree(trap);
return err;
}
return 0;
}
int ocelot_trap_del(struct ocelot *ocelot, int port, unsigned long cookie)
net: mscc: ocelot: set up traps for PTP packets IEEE 1588 support was declared too soon for the Ocelot switch. Out of reset, this switch does not apply any special treatment for PTP packets, i.e. when an event message is received, the natural tendency is to forward it by MAC DA/VLAN ID. This poses a problem when the ingress port is under a bridge, since user space application stacks (written primarily for endpoint ports, not switches) like ptp4l expect that PTP messages are always received on AF_PACKET / AF_INET sockets (depending on the PTP transport being used), and never being autonomously forwarded. Any forwarding, if necessary (for example in Transparent Clock mode) is handled in software by ptp4l. Having the hardware forward these packets too will cause duplicates which will confuse endpoints connected to these switches. So PTP over L2 barely works, in the sense that PTP packets reach the CPU port, but they reach it via flooding, and therefore reach lots of other unwanted destinations too. But PTP over IPv4/IPv6 does not work at all. This is because the Ocelot switch have a separate destination port mask for unknown IP multicast (which PTP over IP is) flooding compared to unknown non-IP multicast (which PTP over L2 is) flooding. Specifically, the driver allows the CPU port to be in the PGID_MC port group, but not in PGID_MCIPV4 and PGID_MCIPV6. There are several presentations from Allan Nielsen which explain that the embedded MIPS CPU on Ocelot switches is not very powerful at all, so every penny they could save by not allowing flooding to the CPU port module matters. Unknown IP multicast did not make it. The de facto consensus is that when a switch is PTP-aware and an application stack for PTP is running, switches should have some sort of trapping mechanism for PTP packets, to extract them from the hardware data path. This avoids both problems: (a) PTP packets are no longer flooded to unwanted destinations (b) PTP over IP packets are no longer denied from reaching the CPU since they arrive there via a trap and not via flooding It is not the first time when this change is attempted. Last time, the feedback from Allan Nielsen and Andrew Lunn was that the traps should not be installed by default, and that PTP-unaware switching may be desired for some use cases: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ To address that feedback, the present patch adds the necessary packet traps according to the RX filter configuration transmitted by user space through the SIOCSHWTSTAMP ioctl. Trapping is done via VCAP IS2, where we keep 5 filters, which are amended each time RX timestamping is enabled or disabled on a port: - 1 for PTP over L2 - 2 for PTP over IPv4 (UDP ports 319 and 320) - 2 for PTP over IPv6 (UDP ports 319 and 320) The cookie by which these filters (invisible to tc) are identified is strategically chosen such that it does not collide with the filters used for the ocelot-8021q tagging protocol by the Felix driver, or with the MRP traps set up by the Ocelot library. Other alternatives were considered, like patching user space to do something, but there are so many ways in which PTP packets could be made to reach the CPU, generically speaking, that "do what?" is a very valid question. The ptp4l program from the linuxptp stack already attempts to do something: it calls setsockopt(IP_ADD_MEMBERSHIP) (and PACKET_ADD_MEMBERSHIP, respectively) which translates in both cases into a dev_mc_add() on the interface, in the kernel: https://github.com/richardcochran/linuxptp/blob/v3.1.1/udp.c#L73 https://github.com/richardcochran/linuxptp/blob/v3.1.1/raw.c Reality shows that this is not sufficient in case the interface belongs to a switchdev driver, as dev_mc_add() does not show the intention to trap a packet to the CPU, but rather the intention to not drop it (it is strictly for RX filtering, same as promiscuous does not mean to send all traffic to the CPU, but to not drop traffic with unknown MAC DA). This topic is a can of worms in itself, and it would be great if user space could just stay out of it. On the other hand, setting up PTP traps privately within the driver is not new by any stretch of the imagination: https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c#L833 https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/dsa/hirschmann/hellcreek.c#L1050 https://elixir.bootlin.com/linux/v5.16-rc2/source/include/linux/dsa/sja1105.h#L21 So this is the approach taken here as well. The difference here being that we prepare and destroy the traps per port, dynamically at runtime, as opposed to driver init time, because apparently, PTP-unaware forwarding is a use case. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Reported-by: Po Liu <po.liu@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-26 17:28:44 +00:00
{
struct ocelot_vcap_block *block_vcap_is2;
struct ocelot_vcap_filter *trap;
block_vcap_is2 = &ocelot->block[VCAP_IS2];
trap = ocelot_vcap_block_find_filter_by_id(block_vcap_is2, cookie,
false);
if (!trap)
return 0;
trap->ingress_port_mask &= ~BIT(port);
net: mscc: ocelot: mark traps with a bool instead of keeping them in a list Since the blamed commit, VCAP filters can appear on more than one list. If their action is "trap", they are chained on ocelot->traps via filter->trap_list. This is in addition to their normal placement on the VCAP block->rules list head. Therefore, when we free a VCAP filter, we must remove it from all lists it is a member of, including ocelot->traps. There are at least 2 bugs which are direct consequences of this design decision. First is the incorrect usage of list_empty(), meant to denote whether "filter" is chained into ocelot->traps via filter->trap_list. This does not do the correct thing, because list_empty() checks whether "head->next == head", but in our case, head->next == head->prev == NULL. So we dereference NULL pointers and die when we call list_del(). Second is the fact that not all places that should remove the filter from ocelot->traps do so. One example is ocelot_vcap_block_remove_filter(), which is where we have the main kfree(filter). By keeping freed filters in ocelot->traps we end up in a use-after-free in felix_update_trapping_destinations(). Attempting to fix all the buggy patterns is a whack-a-mole game which makes the driver unmaintainable. Actually this is what the previous patch version attempted to do: https://patchwork.kernel.org/project/netdevbpf/patch/20220503115728.834457-3-vladimir.oltean@nxp.com/ but it introduced another set of bugs, because there are other places in which create VCAP filters, not just ocelot_vcap_filter_create(): - ocelot_trap_add() - felix_tag_8021q_vlan_add_rx() - felix_tag_8021q_vlan_add_tx() Relying on the convention that all those code paths must call INIT_LIST_HEAD(&filter->trap_list) is not going to scale. So let's do what should have been done in the first place and keep a bool in struct ocelot_vcap_filter which denotes whether we are looking at a trapping rule or not. Iterating now happens over the main VCAP IS2 block->rules. The advantage is that we no longer risk having stale references to a freed filter, since it is only present in that list. Fixes: e42bd4ed09aa ("net: mscc: ocelot: keep traps in a list") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-05-04 23:54:59 +00:00
if (!trap->ingress_port_mask)
net: mscc: ocelot: set up traps for PTP packets IEEE 1588 support was declared too soon for the Ocelot switch. Out of reset, this switch does not apply any special treatment for PTP packets, i.e. when an event message is received, the natural tendency is to forward it by MAC DA/VLAN ID. This poses a problem when the ingress port is under a bridge, since user space application stacks (written primarily for endpoint ports, not switches) like ptp4l expect that PTP messages are always received on AF_PACKET / AF_INET sockets (depending on the PTP transport being used), and never being autonomously forwarded. Any forwarding, if necessary (for example in Transparent Clock mode) is handled in software by ptp4l. Having the hardware forward these packets too will cause duplicates which will confuse endpoints connected to these switches. So PTP over L2 barely works, in the sense that PTP packets reach the CPU port, but they reach it via flooding, and therefore reach lots of other unwanted destinations too. But PTP over IPv4/IPv6 does not work at all. This is because the Ocelot switch have a separate destination port mask for unknown IP multicast (which PTP over IP is) flooding compared to unknown non-IP multicast (which PTP over L2 is) flooding. Specifically, the driver allows the CPU port to be in the PGID_MC port group, but not in PGID_MCIPV4 and PGID_MCIPV6. There are several presentations from Allan Nielsen which explain that the embedded MIPS CPU on Ocelot switches is not very powerful at all, so every penny they could save by not allowing flooding to the CPU port module matters. Unknown IP multicast did not make it. The de facto consensus is that when a switch is PTP-aware and an application stack for PTP is running, switches should have some sort of trapping mechanism for PTP packets, to extract them from the hardware data path. This avoids both problems: (a) PTP packets are no longer flooded to unwanted destinations (b) PTP over IP packets are no longer denied from reaching the CPU since they arrive there via a trap and not via flooding It is not the first time when this change is attempted. Last time, the feedback from Allan Nielsen and Andrew Lunn was that the traps should not be installed by default, and that PTP-unaware switching may be desired for some use cases: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ To address that feedback, the present patch adds the necessary packet traps according to the RX filter configuration transmitted by user space through the SIOCSHWTSTAMP ioctl. Trapping is done via VCAP IS2, where we keep 5 filters, which are amended each time RX timestamping is enabled or disabled on a port: - 1 for PTP over L2 - 2 for PTP over IPv4 (UDP ports 319 and 320) - 2 for PTP over IPv6 (UDP ports 319 and 320) The cookie by which these filters (invisible to tc) are identified is strategically chosen such that it does not collide with the filters used for the ocelot-8021q tagging protocol by the Felix driver, or with the MRP traps set up by the Ocelot library. Other alternatives were considered, like patching user space to do something, but there are so many ways in which PTP packets could be made to reach the CPU, generically speaking, that "do what?" is a very valid question. The ptp4l program from the linuxptp stack already attempts to do something: it calls setsockopt(IP_ADD_MEMBERSHIP) (and PACKET_ADD_MEMBERSHIP, respectively) which translates in both cases into a dev_mc_add() on the interface, in the kernel: https://github.com/richardcochran/linuxptp/blob/v3.1.1/udp.c#L73 https://github.com/richardcochran/linuxptp/blob/v3.1.1/raw.c Reality shows that this is not sufficient in case the interface belongs to a switchdev driver, as dev_mc_add() does not show the intention to trap a packet to the CPU, but rather the intention to not drop it (it is strictly for RX filtering, same as promiscuous does not mean to send all traffic to the CPU, but to not drop traffic with unknown MAC DA). This topic is a can of worms in itself, and it would be great if user space could just stay out of it. On the other hand, setting up PTP traps privately within the driver is not new by any stretch of the imagination: https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/ethernet/mellanox/mlxsw/spectrum_ptp.c#L833 https://elixir.bootlin.com/linux/v5.16-rc2/source/drivers/net/dsa/hirschmann/hellcreek.c#L1050 https://elixir.bootlin.com/linux/v5.16-rc2/source/include/linux/dsa/sja1105.h#L21 So this is the approach taken here as well. The difference here being that we prepare and destroy the traps per port, dynamically at runtime, as opposed to driver init time, because apparently, PTP-unaware forwarding is a use case. Fixes: 4e3b0468e6d7 ("net: mscc: PTP Hardware Clock (PHC) support") Reported-by: Po Liu <po.liu@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Richard Cochran <richardcochran@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-26 17:28:44 +00:00
return ocelot_vcap_filter_del(ocelot, trap);
return ocelot_vcap_filter_replace(ocelot, trap);
}
net: mscc: ocelot: fix incorrect balancing with down LAG ports Assuming the test setup described here: https://patchwork.kernel.org/project/netdevbpf/cover/20210205130240.4072854-1-vladimir.oltean@nxp.com/ (swp1 and swp2 are in bond0, and bond0 is in a bridge with swp0) it can be seen that when swp1 goes down (on either board A or B), then traffic that should go through that port isn't forwarded anywhere. A dump of the PGID table shows the following: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1 PGID_DST[2] = ports 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 1, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 Whereas a "good" PGID configuration for that setup should have looked like this: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1, 2 PGID_DST[2] = ports 1, 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 In other words, in the "bad" configuration, the attempt is to remove the inactive swp1 from the destination ports via PGID_DST. But when a MAC table entry is learned, it is learned towards PGID_DST 1, because that is the logical port id of the LAG itself (it is equal to the lowest numbered member port). So when swp1 becomes inactive, if we set PGID_DST[1] to contain just swp1 and not swp2, the packet will not have any chance to reach the destination via swp2. The "correct" way to remove swp1 as a destination is via PGID_AGGR (remove swp1 from the aggregation port groups for all aggregation codes). This means that PGID_DST[1] and PGID_DST[2] must still contain both swp1 and swp2. This makes the MAC table still treat packets destined towards the single-port LAG as "multicast", and the inactive ports are removed via the aggregation code tables. The change presented here is a design one: the ocelot_get_bond_mask() function used to take an "only_active_ports" argument. We don't need that. The only call site that specifies only_active_ports=true, ocelot_set_aggr_pgids(), must retrieve the entire bonding mask, because it must program that into PGID_DST. Additionally, it must also clear the inactive ports from the bond mask here, which it can't do if bond_mask just contains the active ports: ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; <---- here /* Don't do division by zero if there was no active * port. Just make all aggregation codes zero. */ if (num_active_ports) ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); So it becomes the responsibility of ocelot_set_aggr_pgids() to take ocelot_port->lag_tx_active into consideration when populating the aggr_idx array. Fixes: 23ca3b727ee6 ("net: mscc: ocelot: rebalance LAGs on link up/down events") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220107164332.402133-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-07 16:43:32 +00:00
static u32 ocelot_get_bond_mask(struct ocelot *ocelot, struct net_device *bond)
{
u32 mask = 0;
int port;
lockdep_assert_held(&ocelot->fwd_domain_lock);
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port)
continue;
net: mscc: ocelot: fix incorrect balancing with down LAG ports Assuming the test setup described here: https://patchwork.kernel.org/project/netdevbpf/cover/20210205130240.4072854-1-vladimir.oltean@nxp.com/ (swp1 and swp2 are in bond0, and bond0 is in a bridge with swp0) it can be seen that when swp1 goes down (on either board A or B), then traffic that should go through that port isn't forwarded anywhere. A dump of the PGID table shows the following: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1 PGID_DST[2] = ports 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 1, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 Whereas a "good" PGID configuration for that setup should have looked like this: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1, 2 PGID_DST[2] = ports 1, 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 In other words, in the "bad" configuration, the attempt is to remove the inactive swp1 from the destination ports via PGID_DST. But when a MAC table entry is learned, it is learned towards PGID_DST 1, because that is the logical port id of the LAG itself (it is equal to the lowest numbered member port). So when swp1 becomes inactive, if we set PGID_DST[1] to contain just swp1 and not swp2, the packet will not have any chance to reach the destination via swp2. The "correct" way to remove swp1 as a destination is via PGID_AGGR (remove swp1 from the aggregation port groups for all aggregation codes). This means that PGID_DST[1] and PGID_DST[2] must still contain both swp1 and swp2. This makes the MAC table still treat packets destined towards the single-port LAG as "multicast", and the inactive ports are removed via the aggregation code tables. The change presented here is a design one: the ocelot_get_bond_mask() function used to take an "only_active_ports" argument. We don't need that. The only call site that specifies only_active_ports=true, ocelot_set_aggr_pgids(), must retrieve the entire bonding mask, because it must program that into PGID_DST. Additionally, it must also clear the inactive ports from the bond mask here, which it can't do if bond_mask just contains the active ports: ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; <---- here /* Don't do division by zero if there was no active * port. Just make all aggregation codes zero. */ if (num_active_ports) ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); So it becomes the responsibility of ocelot_set_aggr_pgids() to take ocelot_port->lag_tx_active into consideration when populating the aggr_idx array. Fixes: 23ca3b727ee6 ("net: mscc: ocelot: rebalance LAGs on link up/down events") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220107164332.402133-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-07 16:43:32 +00:00
if (ocelot_port->bond == bond)
mask |= BIT(port);
}
return mask;
}
/* The logical port number of a LAG is equal to the lowest numbered physical
* port ID present in that LAG. It may change if that port ever leaves the LAG.
*/
net: dsa: felix: add support for changing DSA master Changing the DSA master means different things depending on the tagging protocol in use. For NPI mode ("ocelot" and "seville"), there is a single port which can be configured as NPI, but DSA only permits changing the CPU port affinity of user ports one by one. So changing a user port to a different NPI port globally changes what the NPI port is, and breaks the user ports still using the old one. To address this while still permitting the change of the NPI port, require that the user ports which are still affine to the old NPI port are down, and cannot be brought up until they are all affine to the same NPI port. The tag_8021q mode ("ocelot-8021q") is more flexible, in that each user port can be freely assigned to one CPU port or to the other. This works by filtering host addresses towards both tag_8021q CPU ports, and then restricting the forwarding from a certain user port only to one of the two tag_8021q CPU ports. Additionally, the 2 tag_8021q CPU ports can be placed in a LAG. This works by enabling forwarding via PGID_SRC from a certain user port towards the logical port ID containing both tag_8021q CPU ports, but then restricting forwarding per packet, via the LAG hash codes in PGID_AGGR, to either one or the other. When we change the DSA master to a LAG device, DSA guarantees us that the LAG has at least one lower interface as a physical DSA master. But DSA masters can come and go as lowers of that LAG, and ds->ops->port_change_master() will not get called, because the DSA master is still the same (the LAG). So we need to hook into the ds->ops->port_lag_{join,leave} calls on the CPU ports and update the logical port ID of the LAG that user ports are assigned to. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-09-11 01:07:06 +00:00
int ocelot_bond_get_id(struct ocelot *ocelot, struct net_device *bond)
{
int bond_mask = ocelot_get_bond_mask(ocelot, bond);
if (!bond_mask)
return -ENOENT;
return __ffs(bond_mask);
}
net: dsa: felix: add support for changing DSA master Changing the DSA master means different things depending on the tagging protocol in use. For NPI mode ("ocelot" and "seville"), there is a single port which can be configured as NPI, but DSA only permits changing the CPU port affinity of user ports one by one. So changing a user port to a different NPI port globally changes what the NPI port is, and breaks the user ports still using the old one. To address this while still permitting the change of the NPI port, require that the user ports which are still affine to the old NPI port are down, and cannot be brought up until they are all affine to the same NPI port. The tag_8021q mode ("ocelot-8021q") is more flexible, in that each user port can be freely assigned to one CPU port or to the other. This works by filtering host addresses towards both tag_8021q CPU ports, and then restricting the forwarding from a certain user port only to one of the two tag_8021q CPU ports. Additionally, the 2 tag_8021q CPU ports can be placed in a LAG. This works by enabling forwarding via PGID_SRC from a certain user port towards the logical port ID containing both tag_8021q CPU ports, but then restricting forwarding per packet, via the LAG hash codes in PGID_AGGR, to either one or the other. When we change the DSA master to a LAG device, DSA guarantees us that the LAG has at least one lower interface as a physical DSA master. But DSA masters can come and go as lowers of that LAG, and ds->ops->port_change_master() will not get called, because the DSA master is still the same (the LAG). So we need to hook into the ds->ops->port_lag_{join,leave} calls on the CPU ports and update the logical port ID of the LAG that user ports are assigned to. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-09-11 01:07:06 +00:00
EXPORT_SYMBOL_GPL(ocelot_bond_get_id);
net: mscc: ocelot: adjust forwarding domain for CPU ports in a LAG Currently when we have 2 CPU ports configured for DSA tag_8021q mode and we put them in a LAG, a PGID dump looks like this: PGID_SRC[0] = ports 4, PGID_SRC[1] = ports 4, PGID_SRC[2] = ports 4, PGID_SRC[3] = ports 4, PGID_SRC[4] = ports 0, 1, 2, 3, 4, 5, PGID_SRC[5] = no ports (ports 0-3 are user ports, ports 4 and 5 are CPU ports) There are 2 problems with the configuration above: - user ports should enable forwarding towards both CPU ports, not just 4, and the aggregation PGIDs should prune one CPU port or the other from the destination port mask, based on a hash computed from packet headers. - CPU ports should not be allowed to forward towards themselves and also not towards other ports in the same LAG as themselves The first problem requires fixing up the PGID_SRC of user ports, when ocelot_port_assigned_dsa_8021q_cpu_mask() is called. We need to say that when a user port is assigned to a tag_8021q CPU port and that port is in a LAG, it should forward towards all ports in that LAG. The second problem requires fixing up the PGID_SRC of port 4, to remove ports 4 and 5 (in a LAG) from the allowed destinations. After this change, the PGID source masks look as follows: PGID_SRC[0] = ports 4, 5, PGID_SRC[1] = ports 4, 5, PGID_SRC[2] = ports 4, 5, PGID_SRC[3] = ports 4, 5, PGID_SRC[4] = ports 0, 1, 2, 3, PGID_SRC[5] = no ports Note that PGID_SRC[5] still looks weird (it should say "0, 1, 2, 3" just like PGID_SRC[4] does), but I've tested forwarding through this CPU port and it doesn't seem like anything is affected (it appears that PGID_SRC[4] is being looked up on forwarding from the CPU, since both ports 4 and 5 have logical port ID 4). The reason why it looks weird is because we've never called ocelot_port_assign_dsa_8021q_cpu() for any user port towards port 5 (all user ports are assigned to port 4 which is in a LAG with 5). Since things aren't broken, I'm willing to leave it like that for now and just document the oddity. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:20 +00:00
/* Returns the mask of user ports assigned to this DSA tag_8021q CPU port.
* Note that when CPU ports are in a LAG, the user ports are assigned to the
* 'primary' CPU port, the one whose physical port number gives the logical
* port number of the LAG.
*
* We leave PGID_SRC poorly configured for the 'secondary' CPU port in the LAG
* (to which no user port is assigned), but it appears that forwarding from
* this secondary CPU port looks at the PGID_SRC associated with the logical
* port ID that it's assigned to, which *is* configured properly.
*/
static u32 ocelot_dsa_8021q_cpu_assigned_ports(struct ocelot *ocelot,
struct ocelot_port *cpu)
{
u32 mask = 0;
int port;
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port)
continue;
if (ocelot_port->dsa_8021q_cpu == cpu)
mask |= BIT(port);
}
net: mscc: ocelot: adjust forwarding domain for CPU ports in a LAG Currently when we have 2 CPU ports configured for DSA tag_8021q mode and we put them in a LAG, a PGID dump looks like this: PGID_SRC[0] = ports 4, PGID_SRC[1] = ports 4, PGID_SRC[2] = ports 4, PGID_SRC[3] = ports 4, PGID_SRC[4] = ports 0, 1, 2, 3, 4, 5, PGID_SRC[5] = no ports (ports 0-3 are user ports, ports 4 and 5 are CPU ports) There are 2 problems with the configuration above: - user ports should enable forwarding towards both CPU ports, not just 4, and the aggregation PGIDs should prune one CPU port or the other from the destination port mask, based on a hash computed from packet headers. - CPU ports should not be allowed to forward towards themselves and also not towards other ports in the same LAG as themselves The first problem requires fixing up the PGID_SRC of user ports, when ocelot_port_assigned_dsa_8021q_cpu_mask() is called. We need to say that when a user port is assigned to a tag_8021q CPU port and that port is in a LAG, it should forward towards all ports in that LAG. The second problem requires fixing up the PGID_SRC of port 4, to remove ports 4 and 5 (in a LAG) from the allowed destinations. After this change, the PGID source masks look as follows: PGID_SRC[0] = ports 4, 5, PGID_SRC[1] = ports 4, 5, PGID_SRC[2] = ports 4, 5, PGID_SRC[3] = ports 4, 5, PGID_SRC[4] = ports 0, 1, 2, 3, PGID_SRC[5] = no ports Note that PGID_SRC[5] still looks weird (it should say "0, 1, 2, 3" just like PGID_SRC[4] does), but I've tested forwarding through this CPU port and it doesn't seem like anything is affected (it appears that PGID_SRC[4] is being looked up on forwarding from the CPU, since both ports 4 and 5 have logical port ID 4). The reason why it looks weird is because we've never called ocelot_port_assign_dsa_8021q_cpu() for any user port towards port 5 (all user ports are assigned to port 4 which is in a LAG with 5). Since things aren't broken, I'm willing to leave it like that for now and just document the oddity. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:20 +00:00
if (cpu->bond)
mask &= ~ocelot_get_bond_mask(ocelot, cpu->bond);
return mask;
}
net: mscc: ocelot: adjust forwarding domain for CPU ports in a LAG Currently when we have 2 CPU ports configured for DSA tag_8021q mode and we put them in a LAG, a PGID dump looks like this: PGID_SRC[0] = ports 4, PGID_SRC[1] = ports 4, PGID_SRC[2] = ports 4, PGID_SRC[3] = ports 4, PGID_SRC[4] = ports 0, 1, 2, 3, 4, 5, PGID_SRC[5] = no ports (ports 0-3 are user ports, ports 4 and 5 are CPU ports) There are 2 problems with the configuration above: - user ports should enable forwarding towards both CPU ports, not just 4, and the aggregation PGIDs should prune one CPU port or the other from the destination port mask, based on a hash computed from packet headers. - CPU ports should not be allowed to forward towards themselves and also not towards other ports in the same LAG as themselves The first problem requires fixing up the PGID_SRC of user ports, when ocelot_port_assigned_dsa_8021q_cpu_mask() is called. We need to say that when a user port is assigned to a tag_8021q CPU port and that port is in a LAG, it should forward towards all ports in that LAG. The second problem requires fixing up the PGID_SRC of port 4, to remove ports 4 and 5 (in a LAG) from the allowed destinations. After this change, the PGID source masks look as follows: PGID_SRC[0] = ports 4, 5, PGID_SRC[1] = ports 4, 5, PGID_SRC[2] = ports 4, 5, PGID_SRC[3] = ports 4, 5, PGID_SRC[4] = ports 0, 1, 2, 3, PGID_SRC[5] = no ports Note that PGID_SRC[5] still looks weird (it should say "0, 1, 2, 3" just like PGID_SRC[4] does), but I've tested forwarding through this CPU port and it doesn't seem like anything is affected (it appears that PGID_SRC[4] is being looked up on forwarding from the CPU, since both ports 4 and 5 have logical port ID 4). The reason why it looks weird is because we've never called ocelot_port_assign_dsa_8021q_cpu() for any user port towards port 5 (all user ports are assigned to port 4 which is in a LAG with 5). Since things aren't broken, I'm willing to leave it like that for now and just document the oddity. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:20 +00:00
/* Returns the DSA tag_8021q CPU port that the given port is assigned to,
* or the bit mask of CPU ports if said CPU port is in a LAG.
*/
u32 ocelot_port_assigned_dsa_8021q_cpu_mask(struct ocelot *ocelot, int port)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
struct ocelot_port *cpu_port = ocelot_port->dsa_8021q_cpu;
if (!cpu_port)
return 0;
net: mscc: ocelot: adjust forwarding domain for CPU ports in a LAG Currently when we have 2 CPU ports configured for DSA tag_8021q mode and we put them in a LAG, a PGID dump looks like this: PGID_SRC[0] = ports 4, PGID_SRC[1] = ports 4, PGID_SRC[2] = ports 4, PGID_SRC[3] = ports 4, PGID_SRC[4] = ports 0, 1, 2, 3, 4, 5, PGID_SRC[5] = no ports (ports 0-3 are user ports, ports 4 and 5 are CPU ports) There are 2 problems with the configuration above: - user ports should enable forwarding towards both CPU ports, not just 4, and the aggregation PGIDs should prune one CPU port or the other from the destination port mask, based on a hash computed from packet headers. - CPU ports should not be allowed to forward towards themselves and also not towards other ports in the same LAG as themselves The first problem requires fixing up the PGID_SRC of user ports, when ocelot_port_assigned_dsa_8021q_cpu_mask() is called. We need to say that when a user port is assigned to a tag_8021q CPU port and that port is in a LAG, it should forward towards all ports in that LAG. The second problem requires fixing up the PGID_SRC of port 4, to remove ports 4 and 5 (in a LAG) from the allowed destinations. After this change, the PGID source masks look as follows: PGID_SRC[0] = ports 4, 5, PGID_SRC[1] = ports 4, 5, PGID_SRC[2] = ports 4, 5, PGID_SRC[3] = ports 4, 5, PGID_SRC[4] = ports 0, 1, 2, 3, PGID_SRC[5] = no ports Note that PGID_SRC[5] still looks weird (it should say "0, 1, 2, 3" just like PGID_SRC[4] does), but I've tested forwarding through this CPU port and it doesn't seem like anything is affected (it appears that PGID_SRC[4] is being looked up on forwarding from the CPU, since both ports 4 and 5 have logical port ID 4). The reason why it looks weird is because we've never called ocelot_port_assign_dsa_8021q_cpu() for any user port towards port 5 (all user ports are assigned to port 4 which is in a LAG with 5). Since things aren't broken, I'm willing to leave it like that for now and just document the oddity. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:20 +00:00
if (cpu_port->bond)
return ocelot_get_bond_mask(ocelot, cpu_port->bond);
return BIT(cpu_port->index);
}
EXPORT_SYMBOL_GPL(ocelot_port_assigned_dsa_8021q_cpu_mask);
u32 ocelot_get_bridge_fwd_mask(struct ocelot *ocelot, int src_port)
{
struct ocelot_port *ocelot_port = ocelot->ports[src_port];
const struct net_device *bridge;
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
u32 mask = 0;
int port;
if (!ocelot_port || ocelot_port->stp_state != BR_STATE_FORWARDING)
return 0;
bridge = ocelot_port->bridge;
if (!bridge)
return 0;
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
for (port = 0; port < ocelot->num_phys_ports; port++) {
ocelot_port = ocelot->ports[port];
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
if (!ocelot_port)
continue;
if (ocelot_port->stp_state == BR_STATE_FORWARDING &&
ocelot_port->bridge == bridge)
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
mask |= BIT(port);
}
return mask;
}
EXPORT_SYMBOL_GPL(ocelot_get_bridge_fwd_mask);
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
static void ocelot_apply_bridge_fwd_mask(struct ocelot *ocelot, bool joining)
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
{
int port;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
lockdep_assert_held(&ocelot->fwd_domain_lock);
/* If cut-through forwarding is supported, update the masks before a
* port joins the forwarding domain, to avoid potential underruns if it
* has the highest speed from the new domain.
*/
if (joining && ocelot->ops->cut_through_fwd)
ocelot->ops->cut_through_fwd(ocelot);
/* Apply FWD mask. The loop is needed to add/remove the current port as
* a source for the other ports.
*/
for (port = 0; port < ocelot->num_phys_ports; port++) {
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
struct ocelot_port *ocelot_port = ocelot->ports[port];
unsigned long mask;
if (!ocelot_port) {
/* Unused ports can't send anywhere */
mask = 0;
} else if (ocelot_port->is_dsa_8021q_cpu) {
/* The DSA tag_8021q CPU ports need to be able to
* forward packets to all ports assigned to them.
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
*/
mask = ocelot_dsa_8021q_cpu_assigned_ports(ocelot,
ocelot_port);
} else if (ocelot_port->bridge) {
struct net_device *bond = ocelot_port->bond;
mask = ocelot_get_bridge_fwd_mask(ocelot, port);
mask &= ~BIT(port);
mask |= ocelot_port_assigned_dsa_8021q_cpu_mask(ocelot,
port);
net: mscc: ocelot: fix incorrect balancing with down LAG ports Assuming the test setup described here: https://patchwork.kernel.org/project/netdevbpf/cover/20210205130240.4072854-1-vladimir.oltean@nxp.com/ (swp1 and swp2 are in bond0, and bond0 is in a bridge with swp0) it can be seen that when swp1 goes down (on either board A or B), then traffic that should go through that port isn't forwarded anywhere. A dump of the PGID table shows the following: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1 PGID_DST[2] = ports 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 1, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 Whereas a "good" PGID configuration for that setup should have looked like this: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1, 2 PGID_DST[2] = ports 1, 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 In other words, in the "bad" configuration, the attempt is to remove the inactive swp1 from the destination ports via PGID_DST. But when a MAC table entry is learned, it is learned towards PGID_DST 1, because that is the logical port id of the LAG itself (it is equal to the lowest numbered member port). So when swp1 becomes inactive, if we set PGID_DST[1] to contain just swp1 and not swp2, the packet will not have any chance to reach the destination via swp2. The "correct" way to remove swp1 as a destination is via PGID_AGGR (remove swp1 from the aggregation port groups for all aggregation codes). This means that PGID_DST[1] and PGID_DST[2] must still contain both swp1 and swp2. This makes the MAC table still treat packets destined towards the single-port LAG as "multicast", and the inactive ports are removed via the aggregation code tables. The change presented here is a design one: the ocelot_get_bond_mask() function used to take an "only_active_ports" argument. We don't need that. The only call site that specifies only_active_ports=true, ocelot_set_aggr_pgids(), must retrieve the entire bonding mask, because it must program that into PGID_DST. Additionally, it must also clear the inactive ports from the bond mask here, which it can't do if bond_mask just contains the active ports: ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; <---- here /* Don't do division by zero if there was no active * port. Just make all aggregation codes zero. */ if (num_active_ports) ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); So it becomes the responsibility of ocelot_set_aggr_pgids() to take ocelot_port->lag_tx_active into consideration when populating the aggr_idx array. Fixes: 23ca3b727ee6 ("net: mscc: ocelot: rebalance LAGs on link up/down events") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220107164332.402133-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-07 16:43:32 +00:00
if (bond)
mask &= ~ocelot_get_bond_mask(ocelot, bond);
} else {
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
/* Standalone ports forward only to DSA tag_8021q CPU
* ports (if those exist), or to the hardware CPU port
* module otherwise.
*/
mask = ocelot_port_assigned_dsa_8021q_cpu_mask(ocelot,
port);
}
net: dsa: felix: perform switch setup for tag_8021q Unlike sja1105, the only other user of the software-defined tag_8021q.c tagger format, the implementation we choose for the Felix DSA switch driver preserves full functionality under a vlan_filtering bridge (i.e. IP termination works through the DSA user ports under all circumstances). The tag_8021q protocol just wants: - Identifying the ingress switch port based on the RX VLAN ID, as seen by the CPU. We achieve this by using the TCAM engines (which are also used for tc-flower offload) to push the RX VLAN as a second, outer tag, on egress towards the CPU port. - Steering traffic injected into the switch from the network stack towards the correct front port based on the TX VLAN, and consuming (popping) that header on the switch's egress. A tc-flower pseudocode of the static configuration done by the driver would look like this: $ tc qdisc add dev <cpu-port> clsact $ for eth in swp0 swp1 swp2 swp3; do \ tc filter add dev <cpu-port> egress flower indev ${eth} \ action vlan push id <rxvlan> protocol 802.1ad; \ tc filter add dev <cpu-port> ingress protocol 802.1Q flower vlan_id <txvlan> action vlan pop \ action mirred egress redirect dev ${eth}; \ done but of course since DSA does not register network interfaces for the CPU port, this configuration would be impossible for the user to do. Also, due to the same reason, it is impossible for the user to inadvertently delete these rules using tc. These rules do not collide in any way with tc-flower, they just consume some TCAM space, which is something we can live with. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-01-29 01:00:09 +00:00
ocelot_write_rix(ocelot, mask, ANA_PGID_PGID, PGID_SRC + port);
}
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
/* If cut-through forwarding is supported and a port is leaving, there
* is a chance that cut-through was disabled on the other ports due to
* the port which is leaving (it has a higher link speed). We need to
* update the cut-through masks of the remaining ports no earlier than
* after the port has left, to prevent underruns from happening between
* the cut-through update and the forwarding domain update.
*/
if (!joining && ocelot->ops->cut_through_fwd)
ocelot->ops->cut_through_fwd(ocelot);
}
/* Update PGID_CPU which is the destination port mask used for whitelisting
* unicast addresses filtered towards the host. In the normal and NPI modes,
* this points to the analyzer entry for the CPU port module, while in DSA
* tag_8021q mode, it is a bit mask of all active CPU ports.
* PGID_SRC will take care of forwarding a packet from one user port to
* no more than a single CPU port.
*/
static void ocelot_update_pgid_cpu(struct ocelot *ocelot)
{
int pgid_cpu = 0;
int port;
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port || !ocelot_port->is_dsa_8021q_cpu)
continue;
pgid_cpu |= BIT(port);
}
if (!pgid_cpu)
pgid_cpu = BIT(ocelot->num_phys_ports);
ocelot_write_rix(ocelot, pgid_cpu, ANA_PGID_PGID, PGID_CPU);
}
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
void ocelot_port_setup_dsa_8021q_cpu(struct ocelot *ocelot, int cpu)
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
{
struct ocelot_port *cpu_port = ocelot->ports[cpu];
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
u16 vid;
mutex_lock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
cpu_port->is_dsa_8021q_cpu = true;
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
for (vid = OCELOT_RSV_VLAN_RANGE_START; vid < VLAN_N_VID; vid++)
ocelot_vlan_member_add(ocelot, cpu, vid, true);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
ocelot_update_pgid_cpu(ocelot);
mutex_unlock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
}
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
EXPORT_SYMBOL_GPL(ocelot_port_setup_dsa_8021q_cpu);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
void ocelot_port_teardown_dsa_8021q_cpu(struct ocelot *ocelot, int cpu)
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
{
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
struct ocelot_port *cpu_port = ocelot->ports[cpu];
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
u16 vid;
mutex_lock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
cpu_port->is_dsa_8021q_cpu = false;
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
for (vid = OCELOT_RSV_VLAN_RANGE_START; vid < VLAN_N_VID; vid++)
ocelot_vlan_member_del(ocelot, cpu_port->index, vid);
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
ocelot_update_pgid_cpu(ocelot);
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
mutex_unlock(&ocelot->fwd_domain_lock);
}
EXPORT_SYMBOL_GPL(ocelot_port_teardown_dsa_8021q_cpu);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
void ocelot_port_assign_dsa_8021q_cpu(struct ocelot *ocelot, int port,
int cpu)
{
struct ocelot_port *cpu_port = ocelot->ports[cpu];
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: set up tag_8021q CPU ports independent of user port affinity This is a partial revert of commit c295f9831f1d ("net: mscc: ocelot: switch from {,un}set to {,un}assign for tag_8021q CPU ports"), because as it turns out, this isn't how tag_8021q CPU ports under a LAG are supposed to work. Under that scenario, all user ports are "assigned" to the single tag_8021q CPU port represented by the logical port corresponding to the bonding interface. So one CPU port in a LAG would have is_dsa_8021q_cpu set to true (the one whose physical port ID is equal to the logical port ID), and the other one to false. In turn, this makes 2 undesirable things happen: (1) PGID_CPU contains only the first physical CPU port, rather than both (2) only the first CPU port will be added to the private VLANs used by ocelot for VLAN-unaware bridging To make the driver behave in the same way for both bonded CPU ports, we need to bring back the old concept of setting up a port as a tag_8021q CPU port, and this is what deals with VLAN membership and PGID_CPU updating. But we also need the CPU port "assignment" (the user to CPU port affinity), and this is what updates the PGID_SRC forwarding rules. All DSA CPU ports are statically configured for tag_8021q mode when the tagging protocol is changed to ocelot-8021q. User ports are "assigned" to one CPU port or the other dynamically (this will be handled by a future change). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
2022-08-19 17:48:19 +00:00
ocelot->ports[port]->dsa_8021q_cpu = cpu_port;
ocelot_apply_bridge_fwd_mask(ocelot, true);
mutex_unlock(&ocelot->fwd_domain_lock);
}
EXPORT_SYMBOL_GPL(ocelot_port_assign_dsa_8021q_cpu);
void ocelot_port_unassign_dsa_8021q_cpu(struct ocelot *ocelot, int port)
{
mutex_lock(&ocelot->fwd_domain_lock);
ocelot->ports[port]->dsa_8021q_cpu = NULL;
ocelot_apply_bridge_fwd_mask(ocelot, true);
mutex_unlock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
}
EXPORT_SYMBOL_GPL(ocelot_port_unassign_dsa_8021q_cpu);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
void ocelot_bridge_stp_state_set(struct ocelot *ocelot, int port, u8 state)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
u32 learn_ena = 0;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
ocelot_port->stp_state = state;
if ((state == BR_STATE_LEARNING || state == BR_STATE_FORWARDING) &&
ocelot_port->learn_ena)
learn_ena = ANA_PORT_PORT_CFG_LEARN_ENA;
ocelot_rmw_gix(ocelot, learn_ena, ANA_PORT_PORT_CFG_LEARN_ENA,
ANA_PORT_PORT_CFG, port);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_apply_bridge_fwd_mask(ocelot, state == BR_STATE_FORWARDING);
mutex_unlock(&ocelot->fwd_domain_lock);
}
EXPORT_SYMBOL(ocelot_bridge_stp_state_set);
void ocelot_set_ageing_time(struct ocelot *ocelot, unsigned int msecs)
{
unsigned int age_period = ANA_AUTOAGE_AGE_PERIOD(msecs / 2000);
/* Setting AGE_PERIOD to zero effectively disables automatic aging,
* which is clearly not what our intention is. So avoid that.
*/
if (!age_period)
age_period = 1;
ocelot_rmw(ocelot, age_period, ANA_AUTOAGE_AGE_PERIOD_M, ANA_AUTOAGE);
}
EXPORT_SYMBOL(ocelot_set_ageing_time);
static struct ocelot_multicast *ocelot_multicast_get(struct ocelot *ocelot,
const unsigned char *addr,
u16 vid)
{
struct ocelot_multicast *mc;
list_for_each_entry(mc, &ocelot->multicast, list) {
if (ether_addr_equal(mc->addr, addr) && mc->vid == vid)
return mc;
}
return NULL;
}
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
static enum macaccess_entry_type ocelot_classify_mdb(const unsigned char *addr)
{
if (addr[0] == 0x01 && addr[1] == 0x00 && addr[2] == 0x5e)
return ENTRYTYPE_MACv4;
if (addr[0] == 0x33 && addr[1] == 0x33)
return ENTRYTYPE_MACv6;
return ENTRYTYPE_LOCKED;
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
}
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
static struct ocelot_pgid *ocelot_pgid_alloc(struct ocelot *ocelot, int index,
unsigned long ports)
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
{
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
struct ocelot_pgid *pgid;
pgid = kzalloc(sizeof(*pgid), GFP_KERNEL);
if (!pgid)
return ERR_PTR(-ENOMEM);
pgid->ports = ports;
pgid->index = index;
refcount_set(&pgid->refcount, 1);
list_add_tail(&pgid->list, &ocelot->pgids);
return pgid;
}
static void ocelot_pgid_free(struct ocelot *ocelot, struct ocelot_pgid *pgid)
{
if (!refcount_dec_and_test(&pgid->refcount))
return;
list_del(&pgid->list);
kfree(pgid);
}
static struct ocelot_pgid *ocelot_mdb_get_pgid(struct ocelot *ocelot,
const struct ocelot_multicast *mc)
{
struct ocelot_pgid *pgid;
int index;
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
/* According to VSC7514 datasheet 3.9.1.5 IPv4 Multicast Entries and
* 3.9.1.6 IPv6 Multicast Entries, "Instead of a lookup in the
* destination mask table (PGID), the destination set is programmed as
* part of the entry MAC address.", and the DEST_IDX is set to 0.
*/
if (mc->entry_type == ENTRYTYPE_MACv4 ||
mc->entry_type == ENTRYTYPE_MACv6)
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
return ocelot_pgid_alloc(ocelot, 0, mc->ports);
list_for_each_entry(pgid, &ocelot->pgids, list) {
/* When searching for a nonreserved multicast PGID, ignore the
* dummy PGID of zero that we have for MACv4/MACv6 entries
*/
if (pgid->index && pgid->ports == mc->ports) {
refcount_inc(&pgid->refcount);
return pgid;
}
}
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
/* Search for a free index in the nonreserved multicast PGID area */
for_each_nonreserved_multicast_dest_pgid(ocelot, index) {
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
bool used = false;
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
list_for_each_entry(pgid, &ocelot->pgids, list) {
if (pgid->index == index) {
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
used = true;
break;
}
}
if (!used)
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
return ocelot_pgid_alloc(ocelot, index, mc->ports);
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
}
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
return ERR_PTR(-ENOSPC);
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
}
static void ocelot_encode_ports_to_mdb(unsigned char *addr,
struct ocelot_multicast *mc)
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
{
ether_addr_copy(addr, mc->addr);
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
if (mc->entry_type == ENTRYTYPE_MACv4) {
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
addr[0] = 0;
addr[1] = mc->ports >> 8;
addr[2] = mc->ports & 0xff;
} else if (mc->entry_type == ENTRYTYPE_MACv6) {
net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries The current procedure for installing a multicast address is hardcoded for IPv4. But, in the ocelot hardware, there are 3 different procedures for IPv4, IPv6 and for regular L2 multicast. For IPv6 (33-33-xx-xx-xx-xx), it's the same as for IPv4 (01-00-5e-xx-xx-xx), except that the destination port mask is stuffed into first 2 bytes of the MAC address except into first 3 bytes. For plain Ethernet multicast, there's no port-in-address stuffing going on, instead the DEST_IDX (pointer to PGID) is used there, just as for unicast. So we have to use one of the nonreserved multicast PGIDs that the hardware has allocated for this purpose. This patch classifies the type of multicast address based on its first bytes, then redirects to one of the 3 different hardware procedures. Note that this gives us a really better way of redirecting PTP frames sent at 01-1b-19-00-00-00 to the CPU. Previously, Yangbo Lu tried to add a trapping rule for PTP EtherType but got a lot of pushback: https://patchwork.ozlabs.org/project/netdev/patch/20190813025214.18601-5-yangbo.lu@nxp.com/ But right now, that isn't needed at all. The application stack (ptp4l) does this for the PTP multicast addresses it's interested in (which are configurable, and include 01-1b-19-00-00-00): memset(&mreq, 0, sizeof(mreq)); mreq.mr_ifindex = index; mreq.mr_type = PACKET_MR_MULTICAST; mreq.mr_alen = MAC_LEN; memcpy(mreq.mr_address, addr1, MAC_LEN); err1 = setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq)); Into the kernel, this translates into a dev_mc_add on the switch network interfaces, and our drivers know that it means they should translate it into a host MDB address (make the CPU port be the destination). Previously, this was broken because all mdb addresses were treated as IPv4 (which 01-1b-19-00-00-00 obviously is not). Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-06-21 11:46:03 +00:00
addr[0] = mc->ports >> 8;
addr[1] = mc->ports & 0xff;
}
}
int ocelot_port_mdb_add(struct ocelot *ocelot, int port,
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
const struct switchdev_obj_port_mdb *mdb,
const struct net_device *bridge)
{
unsigned char addr[ETH_ALEN];
struct ocelot_multicast *mc;
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
struct ocelot_pgid *pgid;
u16 vid = mdb->vid;
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (!vid)
vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
mc = ocelot_multicast_get(ocelot, mdb->addr, vid);
if (!mc) {
/* New entry */
mc = devm_kzalloc(ocelot->dev, sizeof(*mc), GFP_KERNEL);
if (!mc)
return -ENOMEM;
mc->entry_type = ocelot_classify_mdb(mdb->addr);
ether_addr_copy(mc->addr, mdb->addr);
mc->vid = vid;
list_add_tail(&mc->list, &ocelot->multicast);
} else {
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
/* Existing entry. Clean up the current port mask from
* hardware now, because we'll be modifying it.
*/
ocelot_pgid_free(ocelot, mc->pgid);
ocelot_encode_ports_to_mdb(addr, mc);
ocelot_mact_forget(ocelot, addr, vid);
}
mc->ports |= BIT(port);
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
pgid = ocelot_mdb_get_pgid(ocelot, mc);
if (IS_ERR(pgid)) {
dev_err(ocelot->dev,
"Cannot allocate PGID for mdb %pM vid %d\n",
mc->addr, mc->vid);
devm_kfree(ocelot->dev, mc);
return PTR_ERR(pgid);
}
mc->pgid = pgid;
ocelot_encode_ports_to_mdb(addr, mc);
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
if (mc->entry_type != ENTRYTYPE_MACv4 &&
mc->entry_type != ENTRYTYPE_MACv6)
ocelot_write_rix(ocelot, pgid->ports, ANA_PGID_PGID,
pgid->index);
return ocelot_mact_learn(ocelot, pgid->index, addr, vid,
mc->entry_type);
}
EXPORT_SYMBOL(ocelot_port_mdb_add);
int ocelot_port_mdb_del(struct ocelot *ocelot, int port,
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
const struct switchdev_obj_port_mdb *mdb,
const struct net_device *bridge)
{
unsigned char addr[ETH_ALEN];
struct ocelot_multicast *mc;
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
struct ocelot_pgid *pgid;
u16 vid = mdb->vid;
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (!vid)
vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
mc = ocelot_multicast_get(ocelot, mdb->addr, vid);
if (!mc)
return -ENOENT;
ocelot_encode_ports_to_mdb(addr, mc);
ocelot_mact_forget(ocelot, addr, vid);
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
ocelot_pgid_free(ocelot, mc->pgid);
mc->ports &= ~BIT(port);
if (!mc->ports) {
list_del(&mc->list);
devm_kfree(ocelot->dev, mc);
return 0;
}
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
/* We have a PGID with fewer ports now */
pgid = ocelot_mdb_get_pgid(ocelot, mc);
if (IS_ERR(pgid))
return PTR_ERR(pgid);
mc->pgid = pgid;
ocelot_encode_ports_to_mdb(addr, mc);
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
if (mc->entry_type != ENTRYTYPE_MACv4 &&
mc->entry_type != ENTRYTYPE_MACv6)
ocelot_write_rix(ocelot, pgid->ports, ANA_PGID_PGID,
pgid->index);
return ocelot_mact_learn(ocelot, pgid->index, addr, vid,
mc->entry_type);
}
EXPORT_SYMBOL(ocelot_port_mdb_del);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
int ocelot_port_bridge_join(struct ocelot *ocelot, int port,
struct net_device *bridge, int bridge_num,
struct netlink_ext_ack *extack)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
int err;
err = ocelot_single_vlan_aware_bridge(ocelot, extack);
if (err)
return err;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
ocelot_port->bridge = bridge;
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
ocelot_port->bridge_num = bridge_num;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_apply_bridge_fwd_mask(ocelot, true);
mutex_unlock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (br_vlan_enabled(bridge))
return 0;
return ocelot_add_vlan_unaware_pvid(ocelot, port, bridge);
}
EXPORT_SYMBOL(ocelot_port_bridge_join);
void ocelot_port_bridge_leave(struct ocelot *ocelot, int port,
struct net_device *bridge)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
net: dsa: propagate switchdev vlan_filtering prepare phase to drivers A driver may refuse to enable VLAN filtering for any reason beyond what the DSA framework cares about, such as: - having tc-flower rules that rely on the switch being VLAN-aware - the particular switch does not support VLAN, even if the driver does (the DSA framework just checks for the presence of the .port_vlan_add and .port_vlan_del pointers) - simply not supporting this configuration to be toggled at runtime Currently, when a driver rejects a configuration it cannot support, it does this from the commit phase, which triggers various warnings in switchdev. So propagate the prepare phase to drivers, to give them the ability to refuse invalid configurations cleanly and avoid the warnings. Since we need to modify all function prototypes and check for the prepare phase from within the drivers, take that opportunity and move the existing driver restrictions within the prepare phase where that is possible and easy. Cc: Florian Fainelli <f.fainelli@gmail.com> Cc: Martin Blumenstingl <martin.blumenstingl@googlemail.com> Cc: Hauke Mehrtens <hauke@hauke-m.de> Cc: Woojung Huh <woojung.huh@microchip.com> Cc: Microchip Linux Driver Support <UNGLinuxDriver@microchip.com> Cc: Sean Wang <sean.wang@mediatek.com> Cc: Landen Chao <Landen.Chao@mediatek.com> Cc: Andrew Lunn <andrew@lunn.ch> Cc: Vivien Didelot <vivien.didelot@gmail.com> Cc: Jonathan McDowell <noodles@earth.li> Cc: Linus Walleij <linus.walleij@linaro.org> Cc: Alexandre Belloni <alexandre.belloni@bootlin.com> Cc: Claudiu Manoil <claudiu.manoil@nxp.com> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-10-02 22:06:46 +00:00
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (!br_vlan_enabled(bridge))
ocelot_del_vlan_unaware_pvid(ocelot, port, bridge);
ocelot_port->bridge = NULL;
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
ocelot_port->bridge_num = -1;
ocelot_port_set_pvid(ocelot, port, NULL);
net: mscc: ocelot: allow a config where all bridge VLANs are egress-untagged At present, the ocelot driver accepts a single egress-untagged bridge VLAN, meaning that this sequence of operations: ip link add br0 type bridge vlan_filtering 1 ip link set swp0 master br0 bridge vlan add dev swp0 vid 2 pvid untagged fails because the bridge automatically installs VID 1 as a pvid & untagged VLAN, and vid 2 would be the second untagged VLAN on this port. It is necessary to delete VID 1 before proceeding to add VID 2. This limitation comes from the fact that we operate the port tag, when it has an egress-untagged VID, in the OCELOT_PORT_TAG_NATIVE mode. The ocelot switches do not have full flexibility and can either have one single VID as egress-untagged, or all of them. There are use cases for having all VLANs as egress-untagged as well, and this patch adds support for that. The change rewrites ocelot_port_set_native_vlan() into a more generic ocelot_port_manage_port_tag() function. Because the software bridge's state, transmitted to us via switchdev, can become very complex, we don't attempt to track all possible state transitions, but instead take a more declarative approach and just make ocelot_port_manage_port_tag() figure out which more to operate in: - port is VLAN-unaware: the classified VLAN (internal, unrelated to the 802.1Q header) is not inserted into packets on egress - port is VLAN-aware: - port has tagged VLANs: -> port has no untagged VLAN: set up as pure trunk -> port has one untagged VLAN: set up as trunk port + native VLAN -> port has more than one untagged VLAN: this is an invalid config which is rejected by ocelot_vlan_prepare - port has no tagged VLANs -> set up as pure egress-untagged port We don't keep the number of tagged and untagged VLANs, we just count the structures we keep. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:50 +00:00
ocelot_port_manage_port_tag(ocelot, port);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_apply_bridge_fwd_mask(ocelot, false);
mutex_unlock(&ocelot->fwd_domain_lock);
}
EXPORT_SYMBOL(ocelot_port_bridge_leave);
static void ocelot_set_aggr_pgids(struct ocelot *ocelot)
{
unsigned long visited = GENMASK(ocelot->num_phys_ports - 1, 0);
int i, port, lag;
/* Reset destination and aggregation PGIDS */
for_each_unicast_dest_pgid(ocelot, port)
ocelot_write_rix(ocelot, BIT(port), ANA_PGID_PGID, port);
for_each_aggr_pgid(ocelot, i)
ocelot_write_rix(ocelot, GENMASK(ocelot->num_phys_ports - 1, 0),
ANA_PGID_PGID, i);
/* The visited ports bitmask holds the list of ports offloading any
* bonding interface. Initially we mark all these ports as unvisited,
* then every time we visit a port in this bitmask, we know that it is
* the lowest numbered port, i.e. the one whose logical ID == physical
* port ID == LAG ID. So we mark as visited all further ports in the
* bitmask that are offloading the same bonding interface. This way,
* we set up the aggregation PGIDs only once per bonding interface.
*/
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port || !ocelot_port->bond)
continue;
visited &= ~BIT(port);
}
/* Now, set PGIDs for each active LAG */
for (lag = 0; lag < ocelot->num_phys_ports; lag++) {
struct net_device *bond = ocelot->ports[lag]->bond;
int num_active_ports = 0;
unsigned long bond_mask;
u8 aggr_idx[16];
if (!bond || (visited & BIT(lag)))
continue;
net: mscc: ocelot: fix incorrect balancing with down LAG ports Assuming the test setup described here: https://patchwork.kernel.org/project/netdevbpf/cover/20210205130240.4072854-1-vladimir.oltean@nxp.com/ (swp1 and swp2 are in bond0, and bond0 is in a bridge with swp0) it can be seen that when swp1 goes down (on either board A or B), then traffic that should go through that port isn't forwarded anywhere. A dump of the PGID table shows the following: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1 PGID_DST[2] = ports 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 1, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 Whereas a "good" PGID configuration for that setup should have looked like this: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1, 2 PGID_DST[2] = ports 1, 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 In other words, in the "bad" configuration, the attempt is to remove the inactive swp1 from the destination ports via PGID_DST. But when a MAC table entry is learned, it is learned towards PGID_DST 1, because that is the logical port id of the LAG itself (it is equal to the lowest numbered member port). So when swp1 becomes inactive, if we set PGID_DST[1] to contain just swp1 and not swp2, the packet will not have any chance to reach the destination via swp2. The "correct" way to remove swp1 as a destination is via PGID_AGGR (remove swp1 from the aggregation port groups for all aggregation codes). This means that PGID_DST[1] and PGID_DST[2] must still contain both swp1 and swp2. This makes the MAC table still treat packets destined towards the single-port LAG as "multicast", and the inactive ports are removed via the aggregation code tables. The change presented here is a design one: the ocelot_get_bond_mask() function used to take an "only_active_ports" argument. We don't need that. The only call site that specifies only_active_ports=true, ocelot_set_aggr_pgids(), must retrieve the entire bonding mask, because it must program that into PGID_DST. Additionally, it must also clear the inactive ports from the bond mask here, which it can't do if bond_mask just contains the active ports: ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; <---- here /* Don't do division by zero if there was no active * port. Just make all aggregation codes zero. */ if (num_active_ports) ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); So it becomes the responsibility of ocelot_set_aggr_pgids() to take ocelot_port->lag_tx_active into consideration when populating the aggr_idx array. Fixes: 23ca3b727ee6 ("net: mscc: ocelot: rebalance LAGs on link up/down events") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220107164332.402133-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-07 16:43:32 +00:00
bond_mask = ocelot_get_bond_mask(ocelot, bond);
for_each_set_bit(port, &bond_mask, ocelot->num_phys_ports) {
net: mscc: ocelot: fix incorrect balancing with down LAG ports Assuming the test setup described here: https://patchwork.kernel.org/project/netdevbpf/cover/20210205130240.4072854-1-vladimir.oltean@nxp.com/ (swp1 and swp2 are in bond0, and bond0 is in a bridge with swp0) it can be seen that when swp1 goes down (on either board A or B), then traffic that should go through that port isn't forwarded anywhere. A dump of the PGID table shows the following: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1 PGID_DST[2] = ports 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 1, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 Whereas a "good" PGID configuration for that setup should have looked like this: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1, 2 PGID_DST[2] = ports 1, 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 In other words, in the "bad" configuration, the attempt is to remove the inactive swp1 from the destination ports via PGID_DST. But when a MAC table entry is learned, it is learned towards PGID_DST 1, because that is the logical port id of the LAG itself (it is equal to the lowest numbered member port). So when swp1 becomes inactive, if we set PGID_DST[1] to contain just swp1 and not swp2, the packet will not have any chance to reach the destination via swp2. The "correct" way to remove swp1 as a destination is via PGID_AGGR (remove swp1 from the aggregation port groups for all aggregation codes). This means that PGID_DST[1] and PGID_DST[2] must still contain both swp1 and swp2. This makes the MAC table still treat packets destined towards the single-port LAG as "multicast", and the inactive ports are removed via the aggregation code tables. The change presented here is a design one: the ocelot_get_bond_mask() function used to take an "only_active_ports" argument. We don't need that. The only call site that specifies only_active_ports=true, ocelot_set_aggr_pgids(), must retrieve the entire bonding mask, because it must program that into PGID_DST. Additionally, it must also clear the inactive ports from the bond mask here, which it can't do if bond_mask just contains the active ports: ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; <---- here /* Don't do division by zero if there was no active * port. Just make all aggregation codes zero. */ if (num_active_ports) ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); So it becomes the responsibility of ocelot_set_aggr_pgids() to take ocelot_port->lag_tx_active into consideration when populating the aggr_idx array. Fixes: 23ca3b727ee6 ("net: mscc: ocelot: rebalance LAGs on link up/down events") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220107164332.402133-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-07 16:43:32 +00:00
struct ocelot_port *ocelot_port = ocelot->ports[port];
// Destination mask
ocelot_write_rix(ocelot, bond_mask,
ANA_PGID_PGID, port);
net: mscc: ocelot: fix incorrect balancing with down LAG ports Assuming the test setup described here: https://patchwork.kernel.org/project/netdevbpf/cover/20210205130240.4072854-1-vladimir.oltean@nxp.com/ (swp1 and swp2 are in bond0, and bond0 is in a bridge with swp0) it can be seen that when swp1 goes down (on either board A or B), then traffic that should go through that port isn't forwarded anywhere. A dump of the PGID table shows the following: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1 PGID_DST[2] = ports 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 1, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 1, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 Whereas a "good" PGID configuration for that setup should have looked like this: PGID_DST[0] = ports 0 PGID_DST[1] = ports 1, 2 PGID_DST[2] = ports 1, 2 PGID_DST[3] = ports 3 PGID_DST[4] = ports 4 PGID_DST[5] = ports 5 PGID_DST[6] = no ports PGID_AGGR[0] = ports 0, 2, 3, 4, 5 PGID_AGGR[1] = ports 0, 2, 3, 4, 5 PGID_AGGR[2] = ports 0, 2, 3, 4, 5 PGID_AGGR[3] = ports 0, 2, 3, 4, 5 PGID_AGGR[4] = ports 0, 2, 3, 4, 5 PGID_AGGR[5] = ports 0, 2, 3, 4, 5 PGID_AGGR[6] = ports 0, 2, 3, 4, 5 PGID_AGGR[7] = ports 0, 2, 3, 4, 5 PGID_AGGR[8] = ports 0, 2, 3, 4, 5 PGID_AGGR[9] = ports 0, 2, 3, 4, 5 PGID_AGGR[10] = ports 0, 2, 3, 4, 5 PGID_AGGR[11] = ports 0, 2, 3, 4, 5 PGID_AGGR[12] = ports 0, 2, 3, 4, 5 PGID_AGGR[13] = ports 0, 2, 3, 4, 5 PGID_AGGR[14] = ports 0, 2, 3, 4, 5 PGID_AGGR[15] = ports 0, 2, 3, 4, 5 PGID_SRC[0] = ports 1, 2 PGID_SRC[1] = ports 0 PGID_SRC[2] = ports 0 PGID_SRC[3] = no ports PGID_SRC[4] = no ports PGID_SRC[5] = no ports PGID_SRC[6] = ports 0, 1, 2, 3, 4, 5 In other words, in the "bad" configuration, the attempt is to remove the inactive swp1 from the destination ports via PGID_DST. But when a MAC table entry is learned, it is learned towards PGID_DST 1, because that is the logical port id of the LAG itself (it is equal to the lowest numbered member port). So when swp1 becomes inactive, if we set PGID_DST[1] to contain just swp1 and not swp2, the packet will not have any chance to reach the destination via swp2. The "correct" way to remove swp1 as a destination is via PGID_AGGR (remove swp1 from the aggregation port groups for all aggregation codes). This means that PGID_DST[1] and PGID_DST[2] must still contain both swp1 and swp2. This makes the MAC table still treat packets destined towards the single-port LAG as "multicast", and the inactive ports are removed via the aggregation code tables. The change presented here is a design one: the ocelot_get_bond_mask() function used to take an "only_active_ports" argument. We don't need that. The only call site that specifies only_active_ports=true, ocelot_set_aggr_pgids(), must retrieve the entire bonding mask, because it must program that into PGID_DST. Additionally, it must also clear the inactive ports from the bond mask here, which it can't do if bond_mask just contains the active ports: ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i); ac &= ~bond_mask; <---- here /* Don't do division by zero if there was no active * port. Just make all aggregation codes zero. */ if (num_active_ports) ac |= BIT(aggr_idx[i % num_active_ports]); ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i); So it becomes the responsibility of ocelot_set_aggr_pgids() to take ocelot_port->lag_tx_active into consideration when populating the aggr_idx array. Fixes: 23ca3b727ee6 ("net: mscc: ocelot: rebalance LAGs on link up/down events") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20220107164332.402133-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-01-07 16:43:32 +00:00
if (ocelot_port->lag_tx_active)
aggr_idx[num_active_ports++] = port;
}
for_each_aggr_pgid(ocelot, i) {
u32 ac;
ac = ocelot_read_rix(ocelot, ANA_PGID_PGID, i);
ac &= ~bond_mask;
/* Don't do division by zero if there was no active
* port. Just make all aggregation codes zero.
*/
if (num_active_ports)
ac |= BIT(aggr_idx[i % num_active_ports]);
ocelot_write_rix(ocelot, ac, ANA_PGID_PGID, i);
}
/* Mark all ports in the same LAG as visited to avoid applying
* the same config again.
*/
for (port = lag; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
if (!ocelot_port)
continue;
if (ocelot_port->bond == bond)
visited |= BIT(port);
}
}
}
/* When offloading a bonding interface, the switch ports configured under the
* same bond must have the same logical port ID, equal to the physical port ID
* of the lowest numbered physical port in that bond. Otherwise, in standalone/
* bridged mode, each port has a logical port ID equal to its physical port ID.
*/
static void ocelot_setup_logical_port_ids(struct ocelot *ocelot)
{
int port;
for (port = 0; port < ocelot->num_phys_ports; port++) {
struct ocelot_port *ocelot_port = ocelot->ports[port];
struct net_device *bond;
if (!ocelot_port)
continue;
bond = ocelot_port->bond;
if (bond) {
int lag = ocelot_bond_get_id(ocelot, bond);
ocelot_rmw_gix(ocelot,
ANA_PORT_PORT_CFG_PORTID_VAL(lag),
ANA_PORT_PORT_CFG_PORTID_VAL_M,
ANA_PORT_PORT_CFG, port);
} else {
ocelot_rmw_gix(ocelot,
ANA_PORT_PORT_CFG_PORTID_VAL(port),
ANA_PORT_PORT_CFG_PORTID_VAL_M,
ANA_PORT_PORT_CFG, port);
}
}
}
static int ocelot_migrate_mc(struct ocelot *ocelot, struct ocelot_multicast *mc,
unsigned long from_mask, unsigned long to_mask)
{
unsigned char addr[ETH_ALEN];
struct ocelot_pgid *pgid;
u16 vid = mc->vid;
dev_dbg(ocelot->dev,
"Migrating multicast %pM vid %d from port mask 0x%lx to 0x%lx\n",
mc->addr, mc->vid, from_mask, to_mask);
/* First clean up the current port mask from hardware, because
* we'll be modifying it.
*/
ocelot_pgid_free(ocelot, mc->pgid);
ocelot_encode_ports_to_mdb(addr, mc);
ocelot_mact_forget(ocelot, addr, vid);
mc->ports &= ~from_mask;
mc->ports |= to_mask;
pgid = ocelot_mdb_get_pgid(ocelot, mc);
if (IS_ERR(pgid)) {
dev_err(ocelot->dev,
"Cannot allocate PGID for mdb %pM vid %d\n",
mc->addr, mc->vid);
devm_kfree(ocelot->dev, mc);
return PTR_ERR(pgid);
}
mc->pgid = pgid;
ocelot_encode_ports_to_mdb(addr, mc);
if (mc->entry_type != ENTRYTYPE_MACv4 &&
mc->entry_type != ENTRYTYPE_MACv6)
ocelot_write_rix(ocelot, pgid->ports, ANA_PGID_PGID,
pgid->index);
return ocelot_mact_learn(ocelot, pgid->index, addr, vid,
mc->entry_type);
}
int ocelot_migrate_mdbs(struct ocelot *ocelot, unsigned long from_mask,
unsigned long to_mask)
{
struct ocelot_multicast *mc;
int err;
list_for_each_entry(mc, &ocelot->multicast, list) {
if (!(mc->ports & from_mask))
continue;
err = ocelot_migrate_mc(ocelot, mc, from_mask, to_mask);
if (err)
return err;
}
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_migrate_mdbs);
/* Documentation for PORTID_VAL says:
* Logical port number for front port. If port is not a member of a LLAG,
* then PORTID must be set to the physical port number.
* If port is a member of a LLAG, then PORTID must be set to the common
* PORTID_VAL used for all member ports of the LLAG.
* The value must not exceed the number of physical ports on the device.
*
* This means we have little choice but to migrate FDB entries pointing towards
* a logical port when that changes.
*/
static void ocelot_migrate_lag_fdbs(struct ocelot *ocelot,
struct net_device *bond,
int lag)
{
struct ocelot_lag_fdb *fdb;
int err;
lockdep_assert_held(&ocelot->fwd_domain_lock);
list_for_each_entry(fdb, &ocelot->lag_fdbs, list) {
if (fdb->bond != bond)
continue;
err = ocelot_mact_forget(ocelot, fdb->addr, fdb->vid);
if (err) {
dev_err(ocelot->dev,
"failed to delete LAG %s FDB %pM vid %d: %pe\n",
bond->name, fdb->addr, fdb->vid, ERR_PTR(err));
}
err = ocelot_mact_learn(ocelot, lag, fdb->addr, fdb->vid,
ENTRYTYPE_LOCKED);
if (err) {
dev_err(ocelot->dev,
"failed to migrate LAG %s FDB %pM vid %d: %pe\n",
bond->name, fdb->addr, fdb->vid, ERR_PTR(err));
}
}
}
int ocelot_port_lag_join(struct ocelot *ocelot, int port,
struct net_device *bond,
struct netdev_lag_upper_info *info,
struct netlink_ext_ack *extack)
{
if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
NL_SET_ERR_MSG_MOD(extack,
"Can only offload LAG using hash TX type");
return -EOPNOTSUPP;
}
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
ocelot->ports[port]->bond = bond;
ocelot_setup_logical_port_ids(ocelot);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_apply_bridge_fwd_mask(ocelot, true);
ocelot_set_aggr_pgids(ocelot);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_unlock(&ocelot->fwd_domain_lock);
return 0;
}
EXPORT_SYMBOL(ocelot_port_lag_join);
void ocelot_port_lag_leave(struct ocelot *ocelot, int port,
struct net_device *bond)
{
int old_lag_id, new_lag_id;
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
old_lag_id = ocelot_bond_get_id(ocelot, bond);
ocelot->ports[port]->bond = NULL;
ocelot_setup_logical_port_ids(ocelot);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
ocelot_apply_bridge_fwd_mask(ocelot, false);
ocelot_set_aggr_pgids(ocelot);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
new_lag_id = ocelot_bond_get_id(ocelot, bond);
if (new_lag_id >= 0 && old_lag_id != new_lag_id)
ocelot_migrate_lag_fdbs(ocelot, bond, new_lag_id);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_unlock(&ocelot->fwd_domain_lock);
}
EXPORT_SYMBOL(ocelot_port_lag_leave);
void ocelot_port_lag_change(struct ocelot *ocelot, int port, bool lag_tx_active)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
mutex_lock(&ocelot->fwd_domain_lock);
ocelot_port->lag_tx_active = lag_tx_active;
/* Rebalance the LAGs */
ocelot_set_aggr_pgids(ocelot);
mutex_unlock(&ocelot->fwd_domain_lock);
}
EXPORT_SYMBOL(ocelot_port_lag_change);
int ocelot_lag_fdb_add(struct ocelot *ocelot, struct net_device *bond,
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
const unsigned char *addr, u16 vid,
const struct net_device *bridge)
{
struct ocelot_lag_fdb *fdb;
int lag, err;
fdb = kzalloc(sizeof(*fdb), GFP_KERNEL);
if (!fdb)
return -ENOMEM;
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
mutex_lock(&ocelot->fwd_domain_lock);
if (!vid)
vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
ether_addr_copy(fdb->addr, addr);
fdb->vid = vid;
fdb->bond = bond;
lag = ocelot_bond_get_id(ocelot, bond);
err = ocelot_mact_learn(ocelot, lag, addr, vid, ENTRYTYPE_LOCKED);
if (err) {
mutex_unlock(&ocelot->fwd_domain_lock);
kfree(fdb);
return err;
}
list_add_tail(&fdb->list, &ocelot->lag_fdbs);
mutex_unlock(&ocelot->fwd_domain_lock);
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_lag_fdb_add);
int ocelot_lag_fdb_del(struct ocelot *ocelot, struct net_device *bond,
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
const unsigned char *addr, u16 vid,
const struct net_device *bridge)
{
struct ocelot_lag_fdb *fdb, *tmp;
mutex_lock(&ocelot->fwd_domain_lock);
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
if (!vid)
vid = ocelot_vlan_unaware_pvid(ocelot, bridge);
list_for_each_entry_safe(fdb, tmp, &ocelot->lag_fdbs, list) {
if (!ether_addr_equal(fdb->addr, addr) || fdb->vid != vid ||
fdb->bond != bond)
continue;
ocelot_mact_forget(ocelot, addr, vid);
list_del(&fdb->list);
mutex_unlock(&ocelot->fwd_domain_lock);
kfree(fdb);
return 0;
}
mutex_unlock(&ocelot->fwd_domain_lock);
return -ENOENT;
}
EXPORT_SYMBOL_GPL(ocelot_lag_fdb_del);
/* Configure the maximum SDU (L2 payload) on RX to the value specified in @sdu.
* The length of VLAN tags is accounted for automatically via DEV_MAC_TAGS_CFG.
* In the special case that it's the NPI port that we're configuring, the
* length of the tag and optional prefix needs to be accounted for privately,
* in order to be able to sustain communication at the requested @sdu.
*/
void ocelot_port_set_maxlen(struct ocelot *ocelot, int port, size_t sdu)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
int maxlen = sdu + ETH_HLEN + ETH_FCS_LEN;
int pause_start, pause_stop;
int atop, atop_tot;
if (port == ocelot->npi) {
maxlen += OCELOT_TAG_LEN;
if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_SHORT)
maxlen += OCELOT_SHORT_PREFIX_LEN;
else if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_LONG)
maxlen += OCELOT_LONG_PREFIX_LEN;
}
ocelot_port_writel(ocelot_port, maxlen, DEV_MAC_MAXLEN_CFG);
/* Set Pause watermark hysteresis */
pause_start = 6 * maxlen / OCELOT_BUFFER_CELL_SZ;
pause_stop = 4 * maxlen / OCELOT_BUFFER_CELL_SZ;
ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_START,
pause_start);
ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_STOP,
pause_stop);
/* Tail dropping watermarks */
atop_tot = (ocelot->packet_buffer_size - 9 * maxlen) /
OCELOT_BUFFER_CELL_SZ;
atop = (9 * maxlen) / OCELOT_BUFFER_CELL_SZ;
ocelot_write_rix(ocelot, ocelot->ops->wm_enc(atop), SYS_ATOP, port);
ocelot_write(ocelot, ocelot->ops->wm_enc(atop_tot), SYS_ATOP_TOT_CFG);
}
EXPORT_SYMBOL(ocelot_port_set_maxlen);
int ocelot_get_max_mtu(struct ocelot *ocelot, int port)
{
int max_mtu = 65535 - ETH_HLEN - ETH_FCS_LEN;
if (port == ocelot->npi) {
max_mtu -= OCELOT_TAG_LEN;
if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_SHORT)
max_mtu -= OCELOT_SHORT_PREFIX_LEN;
else if (ocelot->npi_inj_prefix == OCELOT_TAG_PREFIX_LONG)
max_mtu -= OCELOT_LONG_PREFIX_LEN;
}
return max_mtu;
}
EXPORT_SYMBOL(ocelot_get_max_mtu);
static void ocelot_port_set_learning(struct ocelot *ocelot, int port,
bool enabled)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
u32 val = 0;
if (enabled)
val = ANA_PORT_PORT_CFG_LEARN_ENA;
ocelot_rmw_gix(ocelot, val, ANA_PORT_PORT_CFG_LEARN_ENA,
ANA_PORT_PORT_CFG, port);
ocelot_port->learn_ena = enabled;
}
static void ocelot_port_set_ucast_flood(struct ocelot *ocelot, int port,
bool enabled)
{
u32 val = 0;
if (enabled)
val = BIT(port);
ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_UC);
}
static void ocelot_port_set_mcast_flood(struct ocelot *ocelot, int port,
bool enabled)
{
u32 val = 0;
if (enabled)
val = BIT(port);
ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MC);
ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MCIPV4);
ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_MCIPV6);
}
static void ocelot_port_set_bcast_flood(struct ocelot *ocelot, int port,
bool enabled)
{
u32 val = 0;
if (enabled)
val = BIT(port);
ocelot_rmw_rix(ocelot, val, BIT(port), ANA_PGID_PGID, PGID_BC);
}
int ocelot_port_pre_bridge_flags(struct ocelot *ocelot, int port,
struct switchdev_brport_flags flags)
{
if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
BR_BCAST_FLOOD))
return -EINVAL;
return 0;
}
EXPORT_SYMBOL(ocelot_port_pre_bridge_flags);
void ocelot_port_bridge_flags(struct ocelot *ocelot, int port,
struct switchdev_brport_flags flags)
{
if (flags.mask & BR_LEARNING)
ocelot_port_set_learning(ocelot, port,
!!(flags.val & BR_LEARNING));
if (flags.mask & BR_FLOOD)
ocelot_port_set_ucast_flood(ocelot, port,
!!(flags.val & BR_FLOOD));
if (flags.mask & BR_MCAST_FLOOD)
ocelot_port_set_mcast_flood(ocelot, port,
!!(flags.val & BR_MCAST_FLOOD));
if (flags.mask & BR_BCAST_FLOOD)
ocelot_port_set_bcast_flood(ocelot, port,
!!(flags.val & BR_BCAST_FLOOD));
}
EXPORT_SYMBOL(ocelot_port_bridge_flags);
net: dsa: felix: configure default-prio and dscp priorities Follow the established programming model for this driver and provide shims in the felix DSA driver which call the implementations from the ocelot switch lib. The ocelot switchdev driver wasn't integrated with dcbnl due to lack of hardware availability. The switch doesn't have any fancy QoS classification enabled by default. The provided getters will create a default-prio app table entry of 0, and no dscp entry. However, the getters have been made to actually retrieve the hardware configuration rather than static values, to be future proof in case DSA will need this information from more call paths. For default-prio, there is a single field per port, in ANA_PORT_QOS_CFG, called QOS_DEFAULT_VAL. DSCP classification is enabled per-port, again via ANA_PORT_QOS_CFG (field QOS_DSCP_ENA), and individual DSCP values are configured as trusted or not through register ANA_DSCP_CFG (replicated 64 times). An untrusted DSCP value falls back to other QoS classification methods. If trusted, the selected ANA_DSCP_CFG register also holds the QoS class in the QOS_DSCP_VAL field. The hardware also supports DSCP remapping (DSCP value X is translated to DSCP value Y before the QoS class is determined based on the app table entry for Y) and DSCP packet rewriting. The dcbnl framework, for being so flexible in other useless areas, doesn't appear to support this. So this functionality has been left out. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-03-11 21:15:20 +00:00
int ocelot_port_get_default_prio(struct ocelot *ocelot, int port)
{
int val = ocelot_read_gix(ocelot, ANA_PORT_QOS_CFG, port);
return ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL_X(val);
}
EXPORT_SYMBOL_GPL(ocelot_port_get_default_prio);
int ocelot_port_set_default_prio(struct ocelot *ocelot, int port, u8 prio)
{
if (prio >= OCELOT_NUM_TC)
net: dsa: felix: configure default-prio and dscp priorities Follow the established programming model for this driver and provide shims in the felix DSA driver which call the implementations from the ocelot switch lib. The ocelot switchdev driver wasn't integrated with dcbnl due to lack of hardware availability. The switch doesn't have any fancy QoS classification enabled by default. The provided getters will create a default-prio app table entry of 0, and no dscp entry. However, the getters have been made to actually retrieve the hardware configuration rather than static values, to be future proof in case DSA will need this information from more call paths. For default-prio, there is a single field per port, in ANA_PORT_QOS_CFG, called QOS_DEFAULT_VAL. DSCP classification is enabled per-port, again via ANA_PORT_QOS_CFG (field QOS_DSCP_ENA), and individual DSCP values are configured as trusted or not through register ANA_DSCP_CFG (replicated 64 times). An untrusted DSCP value falls back to other QoS classification methods. If trusted, the selected ANA_DSCP_CFG register also holds the QoS class in the QOS_DSCP_VAL field. The hardware also supports DSCP remapping (DSCP value X is translated to DSCP value Y before the QoS class is determined based on the app table entry for Y) and DSCP packet rewriting. The dcbnl framework, for being so flexible in other useless areas, doesn't appear to support this. So this functionality has been left out. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-03-11 21:15:20 +00:00
return -ERANGE;
ocelot_rmw_gix(ocelot,
ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL(prio),
ANA_PORT_QOS_CFG_QOS_DEFAULT_VAL_M,
ANA_PORT_QOS_CFG,
port);
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_port_set_default_prio);
int ocelot_port_get_dscp_prio(struct ocelot *ocelot, int port, u8 dscp)
{
int qos_cfg = ocelot_read_gix(ocelot, ANA_PORT_QOS_CFG, port);
int dscp_cfg = ocelot_read_rix(ocelot, ANA_DSCP_CFG, dscp);
/* Return error if DSCP prioritization isn't enabled */
if (!(qos_cfg & ANA_PORT_QOS_CFG_QOS_DSCP_ENA))
return -EOPNOTSUPP;
if (qos_cfg & ANA_PORT_QOS_CFG_DSCP_TRANSLATE_ENA) {
dscp = ANA_DSCP_CFG_DSCP_TRANSLATE_VAL_X(dscp_cfg);
/* Re-read ANA_DSCP_CFG for the translated DSCP */
dscp_cfg = ocelot_read_rix(ocelot, ANA_DSCP_CFG, dscp);
}
/* If the DSCP value is not trusted, the QoS classification falls back
* to VLAN PCP or port-based default.
*/
if (!(dscp_cfg & ANA_DSCP_CFG_DSCP_TRUST_ENA))
return -EOPNOTSUPP;
return ANA_DSCP_CFG_QOS_DSCP_VAL_X(dscp_cfg);
}
EXPORT_SYMBOL_GPL(ocelot_port_get_dscp_prio);
int ocelot_port_add_dscp_prio(struct ocelot *ocelot, int port, u8 dscp, u8 prio)
{
int mask, val;
if (prio >= OCELOT_NUM_TC)
net: dsa: felix: configure default-prio and dscp priorities Follow the established programming model for this driver and provide shims in the felix DSA driver which call the implementations from the ocelot switch lib. The ocelot switchdev driver wasn't integrated with dcbnl due to lack of hardware availability. The switch doesn't have any fancy QoS classification enabled by default. The provided getters will create a default-prio app table entry of 0, and no dscp entry. However, the getters have been made to actually retrieve the hardware configuration rather than static values, to be future proof in case DSA will need this information from more call paths. For default-prio, there is a single field per port, in ANA_PORT_QOS_CFG, called QOS_DEFAULT_VAL. DSCP classification is enabled per-port, again via ANA_PORT_QOS_CFG (field QOS_DSCP_ENA), and individual DSCP values are configured as trusted or not through register ANA_DSCP_CFG (replicated 64 times). An untrusted DSCP value falls back to other QoS classification methods. If trusted, the selected ANA_DSCP_CFG register also holds the QoS class in the QOS_DSCP_VAL field. The hardware also supports DSCP remapping (DSCP value X is translated to DSCP value Y before the QoS class is determined based on the app table entry for Y) and DSCP packet rewriting. The dcbnl framework, for being so flexible in other useless areas, doesn't appear to support this. So this functionality has been left out. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-03-11 21:15:20 +00:00
return -ERANGE;
/* There is at least one app table priority (this one), so we need to
* make sure DSCP prioritization is enabled on the port.
* Also make sure DSCP translation is disabled
* (dcbnl doesn't support it).
*/
mask = ANA_PORT_QOS_CFG_QOS_DSCP_ENA |
ANA_PORT_QOS_CFG_DSCP_TRANSLATE_ENA;
ocelot_rmw_gix(ocelot, ANA_PORT_QOS_CFG_QOS_DSCP_ENA, mask,
ANA_PORT_QOS_CFG, port);
/* Trust this DSCP value and map it to the given QoS class */
val = ANA_DSCP_CFG_DSCP_TRUST_ENA | ANA_DSCP_CFG_QOS_DSCP_VAL(prio);
ocelot_write_rix(ocelot, val, ANA_DSCP_CFG, dscp);
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_port_add_dscp_prio);
int ocelot_port_del_dscp_prio(struct ocelot *ocelot, int port, u8 dscp, u8 prio)
{
int dscp_cfg = ocelot_read_rix(ocelot, ANA_DSCP_CFG, dscp);
int mask, i;
/* During a "dcb app replace" command, the new app table entry will be
* added first, then the old one will be deleted. But the hardware only
* supports one QoS class per DSCP value (duh), so if we blindly delete
* the app table entry for this DSCP value, we end up deleting the
* entry with the new priority. Avoid that by checking whether user
* space wants to delete the priority which is currently configured, or
* something else which is no longer current.
*/
if (ANA_DSCP_CFG_QOS_DSCP_VAL_X(dscp_cfg) != prio)
return 0;
/* Untrust this DSCP value */
ocelot_write_rix(ocelot, 0, ANA_DSCP_CFG, dscp);
for (i = 0; i < 64; i++) {
int dscp_cfg = ocelot_read_rix(ocelot, ANA_DSCP_CFG, i);
/* There are still app table entries on the port, so we need to
* keep DSCP enabled, nothing to do.
*/
if (dscp_cfg & ANA_DSCP_CFG_DSCP_TRUST_ENA)
return 0;
}
/* Disable DSCP QoS classification if there isn't any trusted
* DSCP value left.
*/
mask = ANA_PORT_QOS_CFG_QOS_DSCP_ENA |
ANA_PORT_QOS_CFG_DSCP_TRANSLATE_ENA;
ocelot_rmw_gix(ocelot, 0, mask, ANA_PORT_QOS_CFG, port);
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_port_del_dscp_prio);
struct ocelot_mirror *ocelot_mirror_get(struct ocelot *ocelot, int to,
struct netlink_ext_ack *extack)
net: mscc: ocelot: add port mirroring support using tc-matchall Ocelot switches perform port-based ingress mirroring if ANA:PORT:PORT_CFG field SRC_MIRROR_ENA is set, and egress mirroring if the port is in ANA:ANA:EMIRRORPORTS. Both ingress-mirrored and egress-mirrored frames are copied to the port mask from ANA:ANA:MIRRORPORTS. So the choice of limiting to a single mirror port via ocelot_mirror_get() and ocelot_mirror_put() may seem bizarre, but the hardware model doesn't map very well to the user space model. If the user wants to mirror the ingress of swp1 towards swp2 and the ingress of swp3 towards swp4, we'd have to program ANA:ANA:MIRRORPORTS with BIT(2) | BIT(4), and that would make swp1 be mirrored towards swp4 too, and swp3 towards swp2. But there are no tc-matchall rules to describe those actions. Now, we could offload a matchall rule with multiple mirred actions, one per desired mirror port, and force the user to stick to the multi-action rule format for subsequent matchall filters. But both DSA and ocelot have the flow_offload_has_one_action() check for the matchall offload, plus the fact that it will get cumbersome to cross-check matchall mirrors with flower mirrors (which will be added in the next patch). As a result, we limit the configuration to a single mirror port, with the possibility of lifting the restriction in the future. Frames injected from the CPU don't get egress-mirrored, since they are sent with the BYPASS bit in the injection frame header, and this bypasses the analyzer module (effectively also the mirroring logic). I don't know what to do/say about this. Functionality was tested with: tc qdisc add dev swp3 clsact tc filter add dev swp3 ingress \ matchall skip_sw \ action mirred egress mirror dev swp1 and pinging through swp3, while seeing that the ICMP replies are mirrored towards swp1. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-03-16 20:41:40 +00:00
{
struct ocelot_mirror *m = ocelot->mirror;
if (m) {
if (m->to != to) {
NL_SET_ERR_MSG_MOD(extack,
"Mirroring already configured towards different egress port");
return ERR_PTR(-EBUSY);
}
refcount_inc(&m->refcount);
return m;
}
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (!m)
return ERR_PTR(-ENOMEM);
m->to = to;
refcount_set(&m->refcount, 1);
ocelot->mirror = m;
/* Program the mirror port to hardware */
ocelot_write(ocelot, BIT(to), ANA_MIRRORPORTS);
return m;
}
void ocelot_mirror_put(struct ocelot *ocelot)
net: mscc: ocelot: add port mirroring support using tc-matchall Ocelot switches perform port-based ingress mirroring if ANA:PORT:PORT_CFG field SRC_MIRROR_ENA is set, and egress mirroring if the port is in ANA:ANA:EMIRRORPORTS. Both ingress-mirrored and egress-mirrored frames are copied to the port mask from ANA:ANA:MIRRORPORTS. So the choice of limiting to a single mirror port via ocelot_mirror_get() and ocelot_mirror_put() may seem bizarre, but the hardware model doesn't map very well to the user space model. If the user wants to mirror the ingress of swp1 towards swp2 and the ingress of swp3 towards swp4, we'd have to program ANA:ANA:MIRRORPORTS with BIT(2) | BIT(4), and that would make swp1 be mirrored towards swp4 too, and swp3 towards swp2. But there are no tc-matchall rules to describe those actions. Now, we could offload a matchall rule with multiple mirred actions, one per desired mirror port, and force the user to stick to the multi-action rule format for subsequent matchall filters. But both DSA and ocelot have the flow_offload_has_one_action() check for the matchall offload, plus the fact that it will get cumbersome to cross-check matchall mirrors with flower mirrors (which will be added in the next patch). As a result, we limit the configuration to a single mirror port, with the possibility of lifting the restriction in the future. Frames injected from the CPU don't get egress-mirrored, since they are sent with the BYPASS bit in the injection frame header, and this bypasses the analyzer module (effectively also the mirroring logic). I don't know what to do/say about this. Functionality was tested with: tc qdisc add dev swp3 clsact tc filter add dev swp3 ingress \ matchall skip_sw \ action mirred egress mirror dev swp1 and pinging through swp3, while seeing that the ICMP replies are mirrored towards swp1. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2022-03-16 20:41:40 +00:00
{
struct ocelot_mirror *m = ocelot->mirror;
if (!refcount_dec_and_test(&m->refcount))
return;
ocelot_write(ocelot, 0, ANA_MIRRORPORTS);
ocelot->mirror = NULL;
kfree(m);
}
int ocelot_port_mirror_add(struct ocelot *ocelot, int from, int to,
bool ingress, struct netlink_ext_ack *extack)
{
struct ocelot_mirror *m = ocelot_mirror_get(ocelot, to, extack);
if (IS_ERR(m))
return PTR_ERR(m);
if (ingress) {
ocelot_rmw_gix(ocelot, ANA_PORT_PORT_CFG_SRC_MIRROR_ENA,
ANA_PORT_PORT_CFG_SRC_MIRROR_ENA,
ANA_PORT_PORT_CFG, from);
} else {
ocelot_rmw(ocelot, BIT(from), BIT(from),
ANA_EMIRRORPORTS);
}
return 0;
}
EXPORT_SYMBOL_GPL(ocelot_port_mirror_add);
void ocelot_port_mirror_del(struct ocelot *ocelot, int from, bool ingress)
{
if (ingress) {
ocelot_rmw_gix(ocelot, 0, ANA_PORT_PORT_CFG_SRC_MIRROR_ENA,
ANA_PORT_PORT_CFG, from);
} else {
ocelot_rmw(ocelot, 0, BIT(from), ANA_EMIRRORPORTS);
}
ocelot_mirror_put(ocelot);
}
EXPORT_SYMBOL_GPL(ocelot_port_mirror_del);
void ocelot_init_port(struct ocelot *ocelot, int port)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
skb_queue_head_init(&ocelot_port->tx_skbs);
/* Basic L2 initialization */
/* Set MAC IFG Gaps
* FDX: TX_IFG = 5, RX_IFG1 = RX_IFG2 = 0
* !FDX: TX_IFG = 5, RX_IFG1 = RX_IFG2 = 5
*/
ocelot_port_writel(ocelot_port, DEV_MAC_IFG_CFG_TX_IFG(5),
DEV_MAC_IFG_CFG);
/* Load seed (0) and set MAC HDX late collision */
ocelot_port_writel(ocelot_port, DEV_MAC_HDX_CFG_LATE_COL_POS(67) |
DEV_MAC_HDX_CFG_SEED_LOAD,
DEV_MAC_HDX_CFG);
mdelay(1);
ocelot_port_writel(ocelot_port, DEV_MAC_HDX_CFG_LATE_COL_POS(67),
DEV_MAC_HDX_CFG);
/* Set Max Length and maximum tags allowed */
ocelot_port_set_maxlen(ocelot, port, ETH_DATA_LEN);
ocelot_port_writel(ocelot_port, DEV_MAC_TAGS_CFG_TAG_ID(ETH_P_8021AD) |
DEV_MAC_TAGS_CFG_VLAN_AWR_ENA |
DEV_MAC_TAGS_CFG_VLAN_DBL_AWR_ENA |
DEV_MAC_TAGS_CFG_VLAN_LEN_AWR_ENA,
DEV_MAC_TAGS_CFG);
/* Set SMAC of Pause frame (00:00:00:00:00:00) */
ocelot_port_writel(ocelot_port, 0, DEV_MAC_FC_MAC_HIGH_CFG);
ocelot_port_writel(ocelot_port, 0, DEV_MAC_FC_MAC_LOW_CFG);
/* Enable transmission of pause frames */
ocelot_fields_write(ocelot, port, SYS_PAUSE_CFG_PAUSE_ENA, 1);
/* Drop frames with multicast source address */
ocelot_rmw_gix(ocelot, ANA_PORT_DROP_CFG_DROP_MC_SMAC_ENA,
ANA_PORT_DROP_CFG_DROP_MC_SMAC_ENA,
ANA_PORT_DROP_CFG, port);
/* Set default VLAN and tag type to 8021Q. */
ocelot_rmw_gix(ocelot, REW_PORT_VLAN_CFG_PORT_TPID(ETH_P_8021Q),
REW_PORT_VLAN_CFG_PORT_TPID_M,
REW_PORT_VLAN_CFG, port);
/* Disable source address learning for standalone mode */
ocelot_port_set_learning(ocelot, port, false);
/* Set the port's initial logical port ID value, enable receiving
* frames on it, and configure the MAC address learning type to
* automatic.
*/
ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_LEARNAUTO |
ANA_PORT_PORT_CFG_RECV_ENA |
ANA_PORT_PORT_CFG_PORTID_VAL(port),
ANA_PORT_PORT_CFG, port);
/* Enable vcap lookups */
ocelot_vcap_enable(ocelot, port);
}
EXPORT_SYMBOL(ocelot_init_port);
/* Configure and enable the CPU port module, which is a set of queues
* accessible through register MMIO, frame DMA or Ethernet (in case
* NPI mode is used).
net: mscc: ocelot: eliminate confusion between CPU and NPI port Ocelot has the concept of a CPU port. The CPU port is represented in the forwarding and the queueing system, but it is not a physical device. The CPU port can either be accessed via register-based injection/extraction (which is the case of Ocelot), via Frame-DMA (similar to the first one), or "connected" to a physical Ethernet port (called NPI in the datasheet) which is the case of the Felix DSA switch. In Ocelot the CPU port is at index 11. In Felix the CPU port is at index 6. The CPU bit is treated special in the forwarding, as it is never cleared from the forwarding port mask (once added to it). Other than that, it is treated the same as a normal front port. Both Felix and Ocelot should use the CPU port in the same way. This means that Felix should not use the NPI port directly when forwarding to the CPU, but instead use the CPU port. This patch is fixing this such that Felix will use port 6 as its CPU port, and just use the NPI port to carry the traffic. Therefore, eliminate the "ocelot->cpu" variable which was holding the index of the NPI port for Felix, and the index of the CPU port module for Ocelot, so the variable was actually configuring different things for different drivers and causing at least part of the confusion. Also remove the "ocelot->num_cpu_ports" variable, which is the result of another confusion. The 2 CPU ports mentioned in the datasheet are because there are two frame extraction channels (register based or DMA based). This is of no relevance to the driver at the moment, and invisible to the analyzer module. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Suggested-by: Allan W. Nielsen <allan.nielsen@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-29 14:50:02 +00:00
*/
static void ocelot_cpu_port_init(struct ocelot *ocelot)
{
net: mscc: ocelot: eliminate confusion between CPU and NPI port Ocelot has the concept of a CPU port. The CPU port is represented in the forwarding and the queueing system, but it is not a physical device. The CPU port can either be accessed via register-based injection/extraction (which is the case of Ocelot), via Frame-DMA (similar to the first one), or "connected" to a physical Ethernet port (called NPI in the datasheet) which is the case of the Felix DSA switch. In Ocelot the CPU port is at index 11. In Felix the CPU port is at index 6. The CPU bit is treated special in the forwarding, as it is never cleared from the forwarding port mask (once added to it). Other than that, it is treated the same as a normal front port. Both Felix and Ocelot should use the CPU port in the same way. This means that Felix should not use the NPI port directly when forwarding to the CPU, but instead use the CPU port. This patch is fixing this such that Felix will use port 6 as its CPU port, and just use the NPI port to carry the traffic. Therefore, eliminate the "ocelot->cpu" variable which was holding the index of the NPI port for Felix, and the index of the CPU port module for Ocelot, so the variable was actually configuring different things for different drivers and causing at least part of the confusion. Also remove the "ocelot->num_cpu_ports" variable, which is the result of another confusion. The 2 CPU ports mentioned in the datasheet are because there are two frame extraction channels (register based or DMA based). This is of no relevance to the driver at the moment, and invisible to the analyzer module. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Suggested-by: Allan W. Nielsen <allan.nielsen@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-29 14:50:02 +00:00
int cpu = ocelot->num_phys_ports;
/* The unicast destination PGID for the CPU port module is unused */
ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, cpu);
net: mscc: ocelot: eliminate confusion between CPU and NPI port Ocelot has the concept of a CPU port. The CPU port is represented in the forwarding and the queueing system, but it is not a physical device. The CPU port can either be accessed via register-based injection/extraction (which is the case of Ocelot), via Frame-DMA (similar to the first one), or "connected" to a physical Ethernet port (called NPI in the datasheet) which is the case of the Felix DSA switch. In Ocelot the CPU port is at index 11. In Felix the CPU port is at index 6. The CPU bit is treated special in the forwarding, as it is never cleared from the forwarding port mask (once added to it). Other than that, it is treated the same as a normal front port. Both Felix and Ocelot should use the CPU port in the same way. This means that Felix should not use the NPI port directly when forwarding to the CPU, but instead use the CPU port. This patch is fixing this such that Felix will use port 6 as its CPU port, and just use the NPI port to carry the traffic. Therefore, eliminate the "ocelot->cpu" variable which was holding the index of the NPI port for Felix, and the index of the CPU port module for Ocelot, so the variable was actually configuring different things for different drivers and causing at least part of the confusion. Also remove the "ocelot->num_cpu_ports" variable, which is the result of another confusion. The 2 CPU ports mentioned in the datasheet are because there are two frame extraction channels (register based or DMA based). This is of no relevance to the driver at the moment, and invisible to the analyzer module. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Suggested-by: Allan W. Nielsen <allan.nielsen@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-29 14:50:02 +00:00
/* Instead set up a multicast destination PGID for traffic copied to
* the CPU. Whitelisted MAC addresses like the port netdevice MAC
* addresses will be copied to the CPU via this PGID.
*/
ocelot_write_rix(ocelot, BIT(cpu), ANA_PGID_PGID, PGID_CPU);
ocelot_write_gix(ocelot, ANA_PORT_PORT_CFG_RECV_ENA |
ANA_PORT_PORT_CFG_PORTID_VAL(cpu),
ANA_PORT_PORT_CFG, cpu);
net: mscc: ocelot: eliminate confusion between CPU and NPI port Ocelot has the concept of a CPU port. The CPU port is represented in the forwarding and the queueing system, but it is not a physical device. The CPU port can either be accessed via register-based injection/extraction (which is the case of Ocelot), via Frame-DMA (similar to the first one), or "connected" to a physical Ethernet port (called NPI in the datasheet) which is the case of the Felix DSA switch. In Ocelot the CPU port is at index 11. In Felix the CPU port is at index 6. The CPU bit is treated special in the forwarding, as it is never cleared from the forwarding port mask (once added to it). Other than that, it is treated the same as a normal front port. Both Felix and Ocelot should use the CPU port in the same way. This means that Felix should not use the NPI port directly when forwarding to the CPU, but instead use the CPU port. This patch is fixing this such that Felix will use port 6 as its CPU port, and just use the NPI port to carry the traffic. Therefore, eliminate the "ocelot->cpu" variable which was holding the index of the NPI port for Felix, and the index of the CPU port module for Ocelot, so the variable was actually configuring different things for different drivers and causing at least part of the confusion. Also remove the "ocelot->num_cpu_ports" variable, which is the result of another confusion. The 2 CPU ports mentioned in the datasheet are because there are two frame extraction channels (register based or DMA based). This is of no relevance to the driver at the moment, and invisible to the analyzer module. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Suggested-by: Allan W. Nielsen <allan.nielsen@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-29 14:50:02 +00:00
/* Enable CPU port module */
ocelot_fields_write(ocelot, cpu, QSYS_SWITCH_PORT_MODE_PORT_ENA, 1);
net: mscc: ocelot: eliminate confusion between CPU and NPI port Ocelot has the concept of a CPU port. The CPU port is represented in the forwarding and the queueing system, but it is not a physical device. The CPU port can either be accessed via register-based injection/extraction (which is the case of Ocelot), via Frame-DMA (similar to the first one), or "connected" to a physical Ethernet port (called NPI in the datasheet) which is the case of the Felix DSA switch. In Ocelot the CPU port is at index 11. In Felix the CPU port is at index 6. The CPU bit is treated special in the forwarding, as it is never cleared from the forwarding port mask (once added to it). Other than that, it is treated the same as a normal front port. Both Felix and Ocelot should use the CPU port in the same way. This means that Felix should not use the NPI port directly when forwarding to the CPU, but instead use the CPU port. This patch is fixing this such that Felix will use port 6 as its CPU port, and just use the NPI port to carry the traffic. Therefore, eliminate the "ocelot->cpu" variable which was holding the index of the NPI port for Felix, and the index of the CPU port module for Ocelot, so the variable was actually configuring different things for different drivers and causing at least part of the confusion. Also remove the "ocelot->num_cpu_ports" variable, which is the result of another confusion. The 2 CPU ports mentioned in the datasheet are because there are two frame extraction channels (register based or DMA based). This is of no relevance to the driver at the moment, and invisible to the analyzer module. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Suggested-by: Allan W. Nielsen <allan.nielsen@microchip.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2020-02-29 14:50:02 +00:00
/* CPU port Injection/Extraction configuration */
ocelot_fields_write(ocelot, cpu, SYS_PORT_MODE_INCL_XTR_HDR,
OCELOT_TAG_PREFIX_NONE);
ocelot_fields_write(ocelot, cpu, SYS_PORT_MODE_INCL_INJ_HDR,
OCELOT_TAG_PREFIX_NONE);
/* Configure the CPU port to be VLAN aware */
net: mscc: ocelot: add the local station MAC addresses in VID 0 The ocelot switchdev driver does not include the CPU port in the list of flooding destinations for unknown traffic, instead that traffic is supposed to match FDB entries to reach the CPU. The addresses it installs are: (a) the station MAC address, in ocelot_probe_port() and later during runtime in ocelot_port_set_mac_address(). These are the VLAN-unaware addresses. The VLAN-aware addresses are in ocelot_vlan_vid_add(). (b) multicast addresses added with dev_mc_add() (not bridge host MDB entries) in ocelot_mc_sync() (c) multicast destination MAC addresses for MRP in ocelot_mrp_save_mac(), to make sure those are dropped (not forwarded) by the bridging service, just trapped to the CPU So we can see that the logic is slightly buggy ever since the initial commit a556c76adc05 ("net: mscc: Add initial Ocelot switch support"). This is because, when ocelot_probe_port() runs, the port pvid is 0. Then we join a VLAN-aware bridge, the pvid becomes 1, we call ocelot_port_set_mac_address(), this learns the new MAC address in VID 1 (also fails to forget the old one, since it thinks it's in VID 1, but that's not so important). Then when we leave the VLAN-aware bridge, outside world is unable to ping our new MAC address because it isn't learned in VID 0, the VLAN-unaware pvid. [ note: this is strictly based on static analysis, I don't have hardware to test. But there are also many more corner cases ] The basic idea is that we should have a separation of concerns, and the FDB entries used for standalone operation should be managed by the driver, and the FDB entries used by the bridging service should be managed by the bridge. So the standalone and VLAN-unaware bridge FDB entries should not follow the bridge PVID, because that will only be active when the bridge is VLAN-aware. So since the port pvid is coincidentally zero during probe time, just make those entries statically go to VID 0. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:51 +00:00
ocelot_write_gix(ocelot,
net: mscc: ocelot: enforce FDB isolation when VLAN-unaware Currently ocelot uses a pvid of 0 for standalone ports and ports under a VLAN-unaware bridge, and the pvid of the bridge for ports under a VLAN-aware bridge. Standalone ports do not perform learning, but packets received on them are still subject to FDB lookups. So if the MAC DA that a standalone port receives has been also learned on a VLAN-unaware bridge port, ocelot will attempt to forward to that port, even though it can't, so it will drop packets. So there is a desire to avoid that, and isolate the FDBs of different bridges from one another, and from standalone ports. The ocelot switch library has two distinct entry points: the felix DSA driver and the ocelot switchdev driver. We need to code up a minimal bridge_num allocation in the ocelot switchdev driver too, this is copied from DSA with the exception that ocelot does not care about DSA trees, cross-chip bridging etc. So it only looks at its own ports that are already in the same bridge. The ocelot switchdev driver uses the bridge_num it has allocated itself, while the felix driver uses the bridge_num allocated by DSA. They are both stored inside ocelot_port->bridge_num by the common function ocelot_port_bridge_join() which receives the bridge_num passed by value. Once we have a bridge_num, we can only use it to enforce isolation between VLAN-unaware bridges. As far as I can see, ocelot does not have anything like a FID that further makes VLAN 100 from a port be different to VLAN 100 from another port with regard to FDB lookup. So we simply deny multiple VLAN-aware bridges. For VLAN-unaware bridges, we crop the 4000-4095 VLAN region and we allocate a VLAN for each bridge_num. This will be used as the pvid of each port that is under that VLAN-unaware bridge, for as long as that bridge is VLAN-unaware. VID 0 remains only for standalone ports. It is okay if all standalone ports use the same VID 0, since they perform no address learning, the FDB will contain no entry in VLAN 0, so the packets will always be flooded to the only possible destination, the CPU port. The CPU port module doesn't need to be member of the VLANs to receive packets, but if we use the DSA tag_8021q protocol, those packets are part of the data plane as far as ocelot is concerned, so there it needs to. Just ensure that the DSA tag_8021q CPU port is a member of all reserved VLANs when it is created, and is removed when it is deleted. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2022-02-25 09:22:25 +00:00
ANA_PORT_VLAN_CFG_VLAN_VID(OCELOT_STANDALONE_PVID) |
net: mscc: ocelot: add the local station MAC addresses in VID 0 The ocelot switchdev driver does not include the CPU port in the list of flooding destinations for unknown traffic, instead that traffic is supposed to match FDB entries to reach the CPU. The addresses it installs are: (a) the station MAC address, in ocelot_probe_port() and later during runtime in ocelot_port_set_mac_address(). These are the VLAN-unaware addresses. The VLAN-aware addresses are in ocelot_vlan_vid_add(). (b) multicast addresses added with dev_mc_add() (not bridge host MDB entries) in ocelot_mc_sync() (c) multicast destination MAC addresses for MRP in ocelot_mrp_save_mac(), to make sure those are dropped (not forwarded) by the bridging service, just trapped to the CPU So we can see that the logic is slightly buggy ever since the initial commit a556c76adc05 ("net: mscc: Add initial Ocelot switch support"). This is because, when ocelot_probe_port() runs, the port pvid is 0. Then we join a VLAN-aware bridge, the pvid becomes 1, we call ocelot_port_set_mac_address(), this learns the new MAC address in VID 1 (also fails to forget the old one, since it thinks it's in VID 1, but that's not so important). Then when we leave the VLAN-aware bridge, outside world is unable to ping our new MAC address because it isn't learned in VID 0, the VLAN-unaware pvid. [ note: this is strictly based on static analysis, I don't have hardware to test. But there are also many more corner cases ] The basic idea is that we should have a separation of concerns, and the FDB entries used for standalone operation should be managed by the driver, and the FDB entries used by the bridging service should be managed by the bridge. So the standalone and VLAN-unaware bridge FDB entries should not follow the bridge PVID, because that will only be active when the bridge is VLAN-aware. So since the port pvid is coincidentally zero during probe time, just make those entries statically go to VID 0. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:51 +00:00
ANA_PORT_VLAN_CFG_VLAN_AWARE_ENA |
ANA_PORT_VLAN_CFG_VLAN_POP_CNT(1),
ANA_PORT_VLAN_CFG, cpu);
}
static void ocelot_detect_features(struct ocelot *ocelot)
{
int mmgt, eq_ctrl;
/* For Ocelot, Felix, Seville, Serval etc, SYS:MMGT:MMGT:FREECNT holds
* the number of 240-byte free memory words (aka 4-cell chunks) and not
* 192 bytes as the documentation incorrectly says.
*/
mmgt = ocelot_read(ocelot, SYS_MMGT);
ocelot->packet_buffer_size = 240 * SYS_MMGT_FREECNT(mmgt);
eq_ctrl = ocelot_read(ocelot, QSYS_EQ_CTRL);
ocelot->num_frame_refs = QSYS_MMGT_EQ_CTRL_FP_FREE_CNT(eq_ctrl);
}
static int ocelot_mem_init_status(struct ocelot *ocelot)
{
unsigned int val;
int err;
err = regmap_field_read(ocelot->regfields[SYS_RESET_CFG_MEM_INIT],
&val);
return err ?: val;
}
int ocelot_reset(struct ocelot *ocelot)
{
int err;
u32 val;
err = regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_INIT], 1);
if (err)
return err;
err = regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_ENA], 1);
if (err)
return err;
/* MEM_INIT is a self-clearing bit. Wait for it to be cleared (should be
* 100us) before enabling the switch core.
*/
err = readx_poll_timeout(ocelot_mem_init_status, ocelot, val, !val,
MEM_INIT_SLEEP_US, MEM_INIT_TIMEOUT_US);
if (err)
return err;
err = regmap_field_write(ocelot->regfields[SYS_RESET_CFG_MEM_ENA], 1);
if (err)
return err;
return regmap_field_write(ocelot->regfields[SYS_RESET_CFG_CORE_ENA], 1);
}
EXPORT_SYMBOL(ocelot_reset);
int ocelot_init(struct ocelot *ocelot)
{
int i, ret;
u32 port;
if (ocelot->ops->reset) {
ret = ocelot->ops->reset(ocelot);
if (ret) {
dev_err(ocelot->dev, "Switch reset failed\n");
return ret;
}
}
mutex_init(&ocelot->ptp_lock);
mutex_init(&ocelot->mact_lock);
net: dsa: felix: enable cut-through forwarding between ports by default The VSC9959 switch embedded within NXP LS1028A (and that version of Ocelot switches only) supports cut-through forwarding - meaning it can start the process of looking up the destination ports for a packet, and forward towards those ports, before the entire packet has been received (as opposed to the store-and-forward mode). The up side is having lower forwarding latency for large packets. The down side is that frames with FCS errors are forwarded instead of being dropped. However, erroneous frames do not result in incorrect updates of the FDB or incorrect policer updates, since these processes are deferred inside the switch to the end of frame. Since the switch starts the cut-through forwarding process after all packet headers (including IP, if any) have been processed, packets with large headers and small payload do not see the benefit of lower forwarding latency. There are two cases that need special attention. The first is when a packet is multicast (or flooded) to multiple destinations, one of which doesn't have cut-through forwarding enabled. The switch deals with this automatically by disabling cut-through forwarding for the frame towards all destination ports. The second is when a packet is forwarded from a port of lower link speed towards a port of higher link speed. This is not handled by the hardware and needs software intervention. Since we practically need to update the cut-through forwarding domain from paths that aren't serialized by the rtnl_mutex (phylink mac_link_down/mac_link_up ops), this means we need to serialize physical link events with user space updates of bonding/bridging domains. Enabling cut-through forwarding is done per {egress port, traffic class}. I don't see any reason why this would be a configurable option as long as it works without issues, and there doesn't appear to be any user space configuration tool to toggle this on/off, so this patch enables cut-through forwarding on all eligible ports and traffic classes. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20211125125808.2383984-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2021-11-25 12:58:08 +00:00
mutex_init(&ocelot->fwd_domain_lock);
mutex_init(&ocelot->tas_lock);
spin_lock_init(&ocelot->ptp_clock_lock);
spin_lock_init(&ocelot->ts_id_lock);
ocelot->owq = alloc_ordered_workqueue("ocelot-owq", 0);
if (!ocelot->owq)
return -ENOMEM;
ret = ocelot_stats_init(ocelot);
if (ret)
goto err_stats_init;
INIT_LIST_HEAD(&ocelot->multicast);
net: mscc: ocelot: support L2 multicast entries There is one main difference in mscc_ocelot between IP multicast and L2 multicast. With IP multicast, destination ports are encoded into the upper bytes of the multicast MAC address. Example: to deliver the address 01:00:5E:11:22:33 to ports 3, 8, and 9, one would need to program the address of 00:03:08:11:22:33 into hardware. Whereas for L2 multicast, the MAC table entry points to a Port Group ID (PGID), and that PGID contains the port mask that the packet will be forwarded to. As to why it is this way, no clue. My guess is that not all port combinations can be supported simultaneously with the limited number of PGIDs, and this was somehow an issue for IP multicast but not for L2 multicast. Anyway. Prior to this change, the raw L2 multicast code was bogus, due to the fact that there wasn't really any way to test it using the bridge code. There were 2 issues: - A multicast PGID was allocated for each MDB entry, but it wasn't in fact programmed to hardware. It was dummy. - In fact we don't want to reserve a multicast PGID for every single MDB entry. That would be odd because we can only have ~60 PGIDs, but thousands of MDB entries. So instead, we want to reserve a multicast PGID for every single port combination for multicast traffic. And since we can have 2 (or more) MDB entries delivered to the same port group (and therefore PGID), we need to reference-count the PGIDs. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-10-29 02:27:38 +00:00
INIT_LIST_HEAD(&ocelot->pgids);
net: mscc: ocelot: convert the VLAN masks to a list First and foremost, the driver currently allocates a constant sized 4K * u32 (16KB memory) array for the VLAN masks. However, a typical application might not need so many VLANs, so if we dynamically allocate the memory as needed, we might actually save some space. Secondly, we'll need to keep more advanced bookkeeping of the VLANs we have, notably we'll have to check how many untagged and how many tagged VLANs we have. This will have to stay in a structure, and allocating another 16 KB array for that is again a bit too much. So refactor the bridge VLANs in a linked list of structures. The hook points inside the driver are ocelot_vlan_member_add() and ocelot_vlan_member_del(), which previously used to operate on the ocelot->vlan_mask[vid] array element. ocelot_vlan_member_add() and ocelot_vlan_member_del() used to call ocelot_vlan_member_set() to commit to the ocelot->vlan_mask. Additionally, we had two calls to ocelot_vlan_member_set() from outside those callers, and those were directly from ocelot_vlan_init(). Those calls do not set up bridging service VLANs, instead they: - clear the VLAN table on reset - set the port pvid to the value used by this driver for VLAN-unaware standalone port operation (VID 0) So now, when we have a structure which represents actual bridge VLANs, VID 0 doesn't belong in that structure, since it is not part of the bridging layer. So delete the middle man, ocelot_vlan_member_set(), and let ocelot_vlan_init() call directly ocelot_vlant_set_mask() which forgoes any data structure and writes directly to hardware, which is all that we need. Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2021-10-20 17:58:49 +00:00
INIT_LIST_HEAD(&ocelot->vlans);
INIT_LIST_HEAD(&ocelot->lag_fdbs);
ocelot_detect_features(ocelot);
ocelot_mact_init(ocelot);
ocelot_vlan_init(ocelot);
ocelot_vcap_init(ocelot);
ocelot_cpu_port_init(ocelot);
if (ocelot->ops->psfp_init)
ocelot->ops->psfp_init(ocelot);
if (ocelot->mm_supported) {
ret = ocelot_mm_init(ocelot);
if (ret)
goto err_mm_init;
}
for (port = 0; port < ocelot->num_phys_ports; port++) {
/* Clear all counters (5 groups) */
ocelot_write(ocelot, SYS_STAT_CFG_STAT_VIEW(port) |
SYS_STAT_CFG_STAT_CLEAR_SHOT(0x7f),
SYS_STAT_CFG);
}
/* Only use S-Tag */
ocelot_write(ocelot, ETH_P_8021AD, SYS_VLAN_ETYPE_CFG);
/* Aggregation mode */
ocelot_write(ocelot, ANA_AGGR_CFG_AC_SMAC_ENA |
ANA_AGGR_CFG_AC_DMAC_ENA |
ANA_AGGR_CFG_AC_IP4_SIPDIP_ENA |
ANA_AGGR_CFG_AC_IP4_TCPUDP_ENA |
ANA_AGGR_CFG_AC_IP6_FLOW_LBL_ENA |
ANA_AGGR_CFG_AC_IP6_TCPUDP_ENA,
ANA_AGGR_CFG);
/* Set MAC age time to default value. The entry is aged after
* 2*AGE_PERIOD
*/
ocelot_write(ocelot,
ANA_AUTOAGE_AGE_PERIOD(BR_DEFAULT_AGEING_TIME / 2 / HZ),
ANA_AUTOAGE);
/* Disable learning for frames discarded by VLAN ingress filtering */
regmap_field_write(ocelot->regfields[ANA_ADVLEARN_VLAN_CHK], 1);
/* Setup frame ageing - fixed value "2 sec" - in 6.5 us units */
ocelot_write(ocelot, SYS_FRM_AGING_AGE_TX_ENA |
SYS_FRM_AGING_MAX_AGE(307692), SYS_FRM_AGING);
/* Setup flooding PGIDs */
net: mscc: ocelot: fix dropping of unknown IPv4 multicast on Seville The current assumption is that the felix DSA driver has flooding knobs per traffic class, while ocelot switchdev has a single flooding knob. This was correct for felix VSC9959 and ocelot VSC7514, but with the introduction of seville VSC9953, we see a switch driven by felix.c which has a single flooding knob. So it is clear that we must do what should have been done from the beginning, which is not to overwrite the configuration done by ocelot.c in felix, but instead to teach the common ocelot library about the differences in our switches, and set up the flooding PGIDs centrally. The effect that the bogus iteration through FELIX_NUM_TC has upon seville is quite dramatic. ANA_FLOODING is located at 0x00b548, and ANA_FLOODING_IPMC is located at 0x00b54c. So the bogus iteration will actually overwrite ANA_FLOODING_IPMC when attempting to write ANA_FLOODING[1]. There is no ANA_FLOODING[1] in sevile, just ANA_FLOODING. And when ANA_FLOODING_IPMC is overwritten with a bogus value, the effect is that ANA_FLOODING_IPMC gets the value of 0x0003CF7D: MC6_DATA = 61, MC6_CTRL = 61, MC4_DATA = 60, MC4_CTRL = 0. Because MC4_CTRL is zero, this means that IPv4 multicast control packets are not flooded, but dropped. An invalid configuration, and this is how the issue was actually spotted. Reported-by: Eldar Gasanov <eldargasanov2@gmail.com> Reported-by: Maxim Kochetkov <fido_max@inbox.ru> Tested-by: Eldar Gasanov <eldargasanov2@gmail.com> Fixes: 84705fc16552 ("net: dsa: felix: introduce support for Seville VSC9953 switch") Fixes: 3c7b51bd39b2 ("net: dsa: felix: allow flooding for all traffic classes") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com> Link: https://lore.kernel.org/r/20201204175416.1445937-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-12-04 17:54:16 +00:00
for (i = 0; i < ocelot->num_flooding_pgids; i++)
ocelot_write_rix(ocelot, ANA_FLOODING_FLD_MULTICAST(PGID_MC) |
ANA_FLOODING_FLD_BROADCAST(PGID_BC) |
net: mscc: ocelot: fix dropping of unknown IPv4 multicast on Seville The current assumption is that the felix DSA driver has flooding knobs per traffic class, while ocelot switchdev has a single flooding knob. This was correct for felix VSC9959 and ocelot VSC7514, but with the introduction of seville VSC9953, we see a switch driven by felix.c which has a single flooding knob. So it is clear that we must do what should have been done from the beginning, which is not to overwrite the configuration done by ocelot.c in felix, but instead to teach the common ocelot library about the differences in our switches, and set up the flooding PGIDs centrally. The effect that the bogus iteration through FELIX_NUM_TC has upon seville is quite dramatic. ANA_FLOODING is located at 0x00b548, and ANA_FLOODING_IPMC is located at 0x00b54c. So the bogus iteration will actually overwrite ANA_FLOODING_IPMC when attempting to write ANA_FLOODING[1]. There is no ANA_FLOODING[1] in sevile, just ANA_FLOODING. And when ANA_FLOODING_IPMC is overwritten with a bogus value, the effect is that ANA_FLOODING_IPMC gets the value of 0x0003CF7D: MC6_DATA = 61, MC6_CTRL = 61, MC4_DATA = 60, MC4_CTRL = 0. Because MC4_CTRL is zero, this means that IPv4 multicast control packets are not flooded, but dropped. An invalid configuration, and this is how the issue was actually spotted. Reported-by: Eldar Gasanov <eldargasanov2@gmail.com> Reported-by: Maxim Kochetkov <fido_max@inbox.ru> Tested-by: Eldar Gasanov <eldargasanov2@gmail.com> Fixes: 84705fc16552 ("net: dsa: felix: introduce support for Seville VSC9953 switch") Fixes: 3c7b51bd39b2 ("net: dsa: felix: allow flooding for all traffic classes") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com> Link: https://lore.kernel.org/r/20201204175416.1445937-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2020-12-04 17:54:16 +00:00
ANA_FLOODING_FLD_UNICAST(PGID_UC),
ANA_FLOODING, i);
ocelot_write(ocelot, ANA_FLOODING_IPMC_FLD_MC6_DATA(PGID_MCIPV6) |
ANA_FLOODING_IPMC_FLD_MC6_CTRL(PGID_MC) |
ANA_FLOODING_IPMC_FLD_MC4_DATA(PGID_MCIPV4) |
ANA_FLOODING_IPMC_FLD_MC4_CTRL(PGID_MC),
ANA_FLOODING_IPMC);
for (port = 0; port < ocelot->num_phys_ports; port++) {
/* Transmit the frame to the local port. */
ocelot_write_rix(ocelot, BIT(port), ANA_PGID_PGID, port);
/* Do not forward BPDU frames to the front ports. */
ocelot_write_gix(ocelot,
ANA_PORT_CPU_FWD_BPDU_CFG_BPDU_REDIR_ENA(0xffff),
ANA_PORT_CPU_FWD_BPDU_CFG,
port);
/* Ensure bridging is disabled */
ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_SRC + port);
}
for_each_nonreserved_multicast_dest_pgid(ocelot, i) {
u32 val = ANA_PGID_PGID_PGID(GENMASK(ocelot->num_phys_ports - 1, 0));
ocelot_write_rix(ocelot, val, ANA_PGID_PGID, i);
}
ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_BLACKHOLE);
/* Allow broadcast and unknown L2 multicast to the CPU. */
ocelot_rmw_rix(ocelot, ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)),
ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)),
ANA_PGID_PGID, PGID_MC);
ocelot_rmw_rix(ocelot, ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)),
ANA_PGID_PGID_PGID(BIT(ocelot->num_phys_ports)),
ANA_PGID_PGID, PGID_BC);
ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_MCIPV4);
ocelot_write_rix(ocelot, 0, ANA_PGID_PGID, PGID_MCIPV6);
/* Allow manual injection via DEVCPU_QS registers, and byte swap these
* registers endianness.
*/
ocelot_write_rix(ocelot, QS_INJ_GRP_CFG_BYTE_SWAP |
QS_INJ_GRP_CFG_MODE(1), QS_INJ_GRP_CFG, 0);
ocelot_write_rix(ocelot, QS_XTR_GRP_CFG_BYTE_SWAP |
QS_XTR_GRP_CFG_MODE(1), QS_XTR_GRP_CFG, 0);
ocelot_write(ocelot, ANA_CPUQ_CFG_CPUQ_MIRROR(2) |
ANA_CPUQ_CFG_CPUQ_LRN(2) |
ANA_CPUQ_CFG_CPUQ_MAC_COPY(2) |
ANA_CPUQ_CFG_CPUQ_SRC_COPY(2) |
ANA_CPUQ_CFG_CPUQ_LOCKED_PORTMOVE(2) |
ANA_CPUQ_CFG_CPUQ_ALLBRIDGE(6) |
ANA_CPUQ_CFG_CPUQ_IPMC_CTRL(6) |
ANA_CPUQ_CFG_CPUQ_IGMP(6) |
ANA_CPUQ_CFG_CPUQ_MLD(6), ANA_CPUQ_CFG);
for (i = 0; i < 16; i++)
ocelot_write_rix(ocelot, ANA_CPUQ_8021_CFG_CPUQ_GARP_VAL(6) |
ANA_CPUQ_8021_CFG_CPUQ_BPDU_VAL(6),
ANA_CPUQ_8021_CFG, i);
return 0;
err_mm_init:
ocelot_stats_deinit(ocelot);
err_stats_init:
destroy_workqueue(ocelot->owq);
return ret;
}
EXPORT_SYMBOL(ocelot_init);
void ocelot_deinit(struct ocelot *ocelot)
{
ocelot_stats_deinit(ocelot);
destroy_workqueue(ocelot->owq);
}
EXPORT_SYMBOL(ocelot_deinit);
void ocelot_deinit_port(struct ocelot *ocelot, int port)
{
struct ocelot_port *ocelot_port = ocelot->ports[port];
skb_queue_purge(&ocelot_port->tx_skbs);
}
EXPORT_SYMBOL(ocelot_deinit_port);
MODULE_LICENSE("Dual MIT/GPL");