From e509996b16728e37d5a909a5c63c1bd64f23b306 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Mon, 2 Sep 2024 17:07:09 -0700 Subject: [PATCH 01/40] xfrm: extract dst lookup parameters into a struct Preparation for adding more fields to dst lookup functions without changing their signatures. Signed-off-by: Eyal Birger Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 26 +++++++++++++------------- net/ipv4/xfrm4_policy.c | 36 +++++++++++++++--------------------- net/ipv6/xfrm6_policy.c | 28 +++++++++++++--------------- net/xfrm/xfrm_device.c | 11 ++++++++--- net/xfrm/xfrm_policy.c | 35 +++++++++++++++++++++++------------ 5 files changed, 72 insertions(+), 64 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b6bfdc6416c7..f3ae50372707 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -349,20 +349,23 @@ struct xfrm_if_cb { void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb); void xfrm_if_unregister_cb(void); +struct xfrm_dst_lookup_params { + struct net *net; + int tos; + int oif; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + u32 mark; +}; + struct net_device; struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { struct dst_ops *dst_ops; - struct dst_entry *(*dst_lookup)(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark); - int (*get_saddr)(struct net *net, int oif, - xfrm_address_t *saddr, - xfrm_address_t *daddr, - u32 mark); + struct dst_entry *(*dst_lookup)(const struct xfrm_dst_lookup_params *params); + int (*get_saddr)(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params); int (*fill_dst)(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl); @@ -1764,10 +1767,7 @@ static inline int xfrm_user_policy(struct sock *sk, int optname, } #endif -struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family, u32 mark); +struct dst_entry *__xfrm_dst_lookup(int family, const struct xfrm_dst_lookup_params *params); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 0294fef577fa..ac1a28ef0c56 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,47 +17,41 @@ #include #include -static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark) +static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, + const struct xfrm_dst_lookup_params *params) { struct rtable *rt; memset(fl4, 0, sizeof(*fl4)); - fl4->daddr = daddr->a4; - fl4->flowi4_tos = tos; - fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(net, oif); - fl4->flowi4_mark = mark; - if (saddr) - fl4->saddr = saddr->a4; + fl4->daddr = params->daddr->a4; + fl4->flowi4_tos = params->tos; + fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, + params->oif); + fl4->flowi4_mark = params->mark; + if (params->saddr) + fl4->saddr = params->saddr->a4; - rt = __ip_route_output_key(net, fl4); + rt = __ip_route_output_key(params->net, fl4); if (!IS_ERR(rt)) return &rt->dst; return ERR_CAST(rt); } -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark) +static struct dst_entry *xfrm4_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); + return __xfrm4_dst_lookup(&fl4, params); } -static int xfrm4_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr, - u32 mark) +static int xfrm4_get_saddr(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); + dst = __xfrm4_dst_lookup(&fl4, params); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index b1d81c4270ab..fc3f5eec6898 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -23,23 +23,21 @@ #include #include -static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - u32 mark) +static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params) { struct flowi6 fl6; struct dst_entry *dst; int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(net, oif); - fl6.flowi6_mark = mark; - memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); - if (saddr) - memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); + fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net, + params->oif); + fl6.flowi6_mark = params->mark; + memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr)); + if (params->saddr) + memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); - dst = ip6_route_output(net, NULL, &fl6); + dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; if (dst->error) { @@ -50,15 +48,14 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, return dst; } -static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr, - u32 mark) +static int xfrm6_get_saddr(xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params) { struct dst_entry *dst; struct net_device *dev; struct inet6_dev *idev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); + dst = xfrm6_dst_lookup(params); if (IS_ERR(dst)) return -EHOSTUNREACH; @@ -68,7 +65,8 @@ static int xfrm6_get_saddr(struct net *net, int oif, return -EHOSTUNREACH; } dev = idev->dev; - ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6); + ipv6_dev_get_saddr(dev_net(dev), dev, ¶ms->daddr->in6, 0, + &saddr->in6); dst_release(dst); return 0; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index f123b7c9ec82..b33c4591e09a 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -269,6 +269,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { + struct xfrm_dst_lookup_params params; + if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { saddr = &x->props.saddr; daddr = &x->id.daddr; @@ -277,9 +279,12 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, daddr = &x->props.saddr; } - dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, - x->props.family, - xfrm_smark_get(0, x)); + memset(¶ms, 0, sizeof(params)); + params.net = net; + params.saddr = saddr; + params.daddr = daddr; + params.mark = xfrm_smark_get(0, x); + dst = __xfrm_dst_lookup(x->props.family, ¶ms); if (IS_ERR(dst)) return (is_packet_offload) ? -EINVAL : 0; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 914bac03b52a..ec8f2fe13a51 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -270,10 +270,8 @@ static const struct xfrm_if_cb *xfrm_if_get_cb(void) return rcu_dereference(xfrm_if_cb); } -struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family, u32 mark) +struct dst_entry *__xfrm_dst_lookup(int family, + const struct xfrm_dst_lookup_params *params) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -282,7 +280,7 @@ struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); + dst = afinfo->dst_lookup(params); rcu_read_unlock(); @@ -296,6 +294,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, xfrm_address_t *prev_daddr, int family, u32 mark) { + struct xfrm_dst_lookup_params params; struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; @@ -310,7 +309,14 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); + params.net = net; + params.saddr = saddr; + params.daddr = daddr; + params.tos = tos; + params.oif = oif; + params.mark = mark; + + dst = __xfrm_dst_lookup(family, ¶ms); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -2432,15 +2438,15 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) } static int -xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, - xfrm_address_t *remote, unsigned short family, u32 mark) +xfrm_get_saddr(unsigned short family, xfrm_address_t *saddr, + const struct xfrm_dst_lookup_params *params) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, oif, local, remote, mark); + err = afinfo->get_saddr(saddr, params); rcu_read_unlock(); return err; } @@ -2469,9 +2475,14 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { - error = xfrm_get_saddr(net, fl->flowi_oif, - &tmp, remote, - tmpl->encap_family, 0); + struct xfrm_dst_lookup_params params; + + memset(¶ms, 0, sizeof(params)); + params.net = net; + params.oif = fl->flowi_oif; + params.daddr = remote; + error = xfrm_get_saddr(tmpl->encap_family, &tmp, + ¶ms); if (error) goto fail; local = &tmp; From b8469721034300bbb6dec5b4bf32492c95e16a0c Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Mon, 2 Sep 2024 17:07:10 -0700 Subject: [PATCH 02/40] xfrm: respect ip protocols rules criteria when performing dst lookups The series in the "fixes" tag added the ability to consider L4 attributes in routing rules. The dst lookup on the outer packet of encapsulated traffic in the xfrm code was not adapted to this change, thus routing behavior that relies on L4 information is not respected. Pass the ip protocol information when performing dst lookups. Fixes: a25724b05af0 ("Merge branch 'fib_rules-support-sport-dport-and-proto-match'") Signed-off-by: Eyal Birger Tested-by: Antony Antony Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 ++ net/ipv4/xfrm4_policy.c | 2 ++ net/ipv6/xfrm6_policy.c | 3 +++ net/xfrm/xfrm_policy.c | 15 +++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f3ae50372707..a0bdd58f401c 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -356,6 +356,8 @@ struct xfrm_dst_lookup_params { xfrm_address_t *saddr; xfrm_address_t *daddr; u32 mark; + __u8 ipproto; + union flowi_uli uli; }; struct net_device; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index ac1a28ef0c56..7e1c2faed1ff 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -30,6 +30,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, fl4->flowi4_mark = params->mark; if (params->saddr) fl4->saddr = params->saddr->a4; + fl4->flowi4_proto = params->ipproto; + fl4->uli = params->uli; rt = __ip_route_output_key(params->net, fl4); if (!IS_ERR(rt)) diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index fc3f5eec6898..1f19b6f14484 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -37,6 +37,9 @@ static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *p if (params->saddr) memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr)); + fl6.flowi4_proto = params->ipproto; + fl6.uli = params->uli; + dst = ip6_route_output(params->net, NULL, &fl6); err = dst->error; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ec8f2fe13a51..55cc9f4cb42b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -315,6 +315,21 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, params.tos = tos; params.oif = oif; params.mark = mark; + params.ipproto = x->id.proto; + if (x->encap) { + switch (x->encap->encap_type) { + case UDP_ENCAP_ESPINUDP: + params.ipproto = IPPROTO_UDP; + params.uli.ports.sport = x->encap->encap_sport; + params.uli.ports.dport = x->encap->encap_dport; + break; + case TCP_ENCAP_ESPINTCP: + params.ipproto = IPPROTO_TCP; + params.uli.ports.sport = x->encap->encap_sport; + params.uli.ports.dport = x->encap->encap_dport; + break; + } + } dst = __xfrm_dst_lookup(family, ¶ms); From 645546a05b0370391c0eac0f14f5b9ddf8d00731 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 18 Sep 2024 11:12:49 +0200 Subject: [PATCH 03/40] xfrm: policy: remove last remnants of pernet inexact list xfrm_net still contained the no-longer-used inexact policy list heads, remove them. Fixes: a54ad727f745 ("xfrm: policy: remove remaining use of inexact list") Signed-off-by: Florian Westphal Signed-off-by: Steffen Klassert --- include/net/netns/xfrm.h | 1 - net/xfrm/xfrm_policy.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index d489d9250bff..ae60d6664095 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -51,7 +51,6 @@ struct netns_xfrm { struct hlist_head *policy_byidx; unsigned int policy_idx_hmask; unsigned int idx_generator; - struct hlist_head policy_inexact[XFRM_POLICY_MAX]; struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; unsigned int policy_count[XFRM_POLICY_MAX * 2]; struct work_struct policy_hash_work; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 55cc9f4cb42b..a2ea9dbac90b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4206,7 +4206,6 @@ static int __net_init xfrm_policy_init(struct net *net) net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; - INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); @@ -4260,8 +4259,6 @@ static void xfrm_policy_fini(struct net *net) for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; - WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); - htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); From 3f0ab59e6537c6a8f9e1b355b48f9c05a76e8563 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 1 Oct 2024 18:48:14 +0200 Subject: [PATCH 04/40] xfrm: validate new SA's prefixlen using SA family when sel.family is unset This expands the validation introduced in commit 07bf7908950a ("xfrm: Validate address prefix lengths in the xfrm selector.") syzbot created an SA with usersa.sel.family = AF_UNSPEC usersa.sel.prefixlen_s = 128 usersa.family = AF_INET Because of the AF_UNSPEC selector, verify_newsa_info doesn't put limits on prefixlen_{s,d}. But then copy_from_user_state sets x->sel.family to usersa.family (AF_INET). Do the same conversion in verify_newsa_info before validating prefixlen_{s,d}, since that's how prefixlen is going to be used later on. Reported-by: syzbot+cc39f136925517aed571@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 55f039ec3d59..8d06a37adbd9 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -201,6 +201,7 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, { int err; u8 sa_dir = attrs[XFRMA_SA_DIR] ? nla_get_u8(attrs[XFRMA_SA_DIR]) : 0; + u16 family = p->sel.family; err = -EINVAL; switch (p->family) { @@ -221,7 +222,10 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; } - switch (p->sel.family) { + if (!family && !(p->flags & XFRM_STATE_AF_UNSPEC)) + family = p->family; + + switch (family) { case AF_UNSPEC: break; From 6889cd2a93e1e3606b3f6e958aa0924e836de4d2 Mon Sep 17 00:00:00 2001 From: Petr Vaganov Date: Tue, 8 Oct 2024 14:02:58 +0500 Subject: [PATCH 05/40] xfrm: fix one more kernel-infoleak in algo dumping During fuzz testing, the following issue was discovered: BUG: KMSAN: kernel-infoleak in _copy_to_iter+0x598/0x2a30 _copy_to_iter+0x598/0x2a30 __skb_datagram_iter+0x168/0x1060 skb_copy_datagram_iter+0x5b/0x220 netlink_recvmsg+0x362/0x1700 sock_recvmsg+0x2dc/0x390 __sys_recvfrom+0x381/0x6d0 __x64_sys_recvfrom+0x130/0x200 x64_sys_call+0x32c8/0x3cc0 do_syscall_64+0xd8/0x1c0 entry_SYSCALL_64_after_hwframe+0x79/0x81 Uninit was stored to memory at: copy_to_user_state_extra+0xcc1/0x1e00 dump_one_state+0x28c/0x5f0 xfrm_state_walk+0x548/0x11e0 xfrm_dump_sa+0x1e0/0x840 netlink_dump+0x943/0x1c40 __netlink_dump_start+0x746/0xdb0 xfrm_user_rcv_msg+0x429/0xc00 netlink_rcv_skb+0x613/0x780 xfrm_netlink_rcv+0x77/0xc0 netlink_unicast+0xe90/0x1280 netlink_sendmsg+0x126d/0x1490 __sock_sendmsg+0x332/0x3d0 ____sys_sendmsg+0x863/0xc30 ___sys_sendmsg+0x285/0x3e0 __x64_sys_sendmsg+0x2d6/0x560 x64_sys_call+0x1316/0x3cc0 do_syscall_64+0xd8/0x1c0 entry_SYSCALL_64_after_hwframe+0x79/0x81 Uninit was created at: __kmalloc+0x571/0xd30 attach_auth+0x106/0x3e0 xfrm_add_sa+0x2aa0/0x4230 xfrm_user_rcv_msg+0x832/0xc00 netlink_rcv_skb+0x613/0x780 xfrm_netlink_rcv+0x77/0xc0 netlink_unicast+0xe90/0x1280 netlink_sendmsg+0x126d/0x1490 __sock_sendmsg+0x332/0x3d0 ____sys_sendmsg+0x863/0xc30 ___sys_sendmsg+0x285/0x3e0 __x64_sys_sendmsg+0x2d6/0x560 x64_sys_call+0x1316/0x3cc0 do_syscall_64+0xd8/0x1c0 entry_SYSCALL_64_after_hwframe+0x79/0x81 Bytes 328-379 of 732 are uninitialized Memory access of size 732 starts at ffff88800e18e000 Data copied to user address 00007ff30f48aff0 CPU: 2 PID: 18167 Comm: syz-executor.0 Not tainted 6.8.11 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Fixes copying of xfrm algorithms where some random data of the structure fields can end up in userspace. Padding in structures may be filled with random (possibly sensitve) data and should never be given directly to user-space. A similar issue was resolved in the commit 8222d5910dae ("xfrm: Zero padding when dumping algos and encap") Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: c7a5899eb26e ("xfrm: redact SA secret with lockdown confidentiality") Cc: stable@vger.kernel.org Co-developed-by: Boris Tonofa Signed-off-by: Boris Tonofa Signed-off-by: Petr Vaganov Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8d06a37adbd9..eb146f574bbd 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1102,7 +1102,9 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) if (!nla) return -EMSGSIZE; ap = nla_data(nla); - memcpy(ap, auth, sizeof(struct xfrm_algo_auth)); + strscpy_pad(ap->alg_name, auth->alg_name, sizeof(ap->alg_name)); + ap->alg_key_len = auth->alg_key_len; + ap->alg_trunc_len = auth->alg_trunc_len; if (redact_secret && auth->alg_key_len) memset(ap->alg_key, 0, (auth->alg_key_len + 7) / 8); else From 1230fe7ad3974f7bf6c78901473e039b34d4fb1f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 10 Oct 2024 18:34:05 +0200 Subject: [PATCH 06/40] netfilter: bpf: must hold reference on net namespace BUG: KASAN: slab-use-after-free in __nf_unregister_net_hook+0x640/0x6b0 Read of size 8 at addr ffff8880106fe400 by task repro/72= bpf_nf_link_release+0xda/0x1e0 bpf_link_free+0x139/0x2d0 bpf_link_release+0x68/0x80 __fput+0x414/0xb60 Eric says: It seems that bpf was able to defer the __nf_unregister_net_hook() after exit()/close() time. Perhaps a netns reference is missing, because the netns has been dismantled/freed already. bpf_nf_link_attach() does : link->net = net; But I do not see a reference being taken on net. Add such a reference and release it after hook unreg. Note that I was unable to get syzbot reproducer to work, so I do not know if this resolves this splat. Fixes: 84601d6ee68a ("bpf: add bpf_link support for BPF_NETFILTER programs") Diagnosed-by: Eric Dumazet Reported-by: Lai, Yi Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_bpf_link.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 5257d5e7eb09..e5e79a08c10b 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -23,6 +23,7 @@ static unsigned int nf_hook_run_bpf(void *bpf_prog, struct sk_buff *skb, struct bpf_nf_link { struct bpf_link link; struct nf_hook_ops hook_ops; + netns_tracker ns_tracker; struct net *net; u32 dead; const struct nf_defrag_hook *defrag_hook; @@ -120,6 +121,7 @@ static void bpf_nf_link_release(struct bpf_link *link) if (!cmpxchg(&nf_link->dead, 0, 1)) { nf_unregister_net_hook(nf_link->net, &nf_link->hook_ops); bpf_nf_disable_defrag(nf_link); + put_net_track(nf_link->net, &nf_link->ns_tracker); } } @@ -257,6 +259,8 @@ int bpf_nf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + get_net_track(net, &link->ns_tracker, GFP_KERNEL); + return bpf_link_settle(&link_primer); } From f2767a41959e60763949c73ee180e40c686e807e Mon Sep 17 00:00:00 2001 From: Kory Maincent Date: Tue, 15 Oct 2024 15:02:54 +0200 Subject: [PATCH 07/40] net: pse-pd: Fix out of bound for loop Adjust the loop limit to prevent out-of-bounds access when iterating over PI structures. The loop should not reach the index pcdev->nr_lines since we allocate exactly pcdev->nr_lines number of PI structures. This fix ensures proper bounds are maintained during iterations. Fixes: 9be9567a7c59 ("net: pse-pd: Add support for PSE PIs") Signed-off-by: Kory Maincent Reviewed-by: Simon Horman Acked-by: Oleksij Rempel Message-ID: <20241015130255.125508-1-kory.maincent@bootlin.com> Signed-off-by: Andrew Lunn --- drivers/net/pse-pd/pse_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c index f8e6854781e6..2906ce173f66 100644 --- a/drivers/net/pse-pd/pse_core.c +++ b/drivers/net/pse-pd/pse_core.c @@ -113,7 +113,7 @@ static void pse_release_pis(struct pse_controller_dev *pcdev) { int i; - for (i = 0; i <= pcdev->nr_lines; i++) { + for (i = 0; i < pcdev->nr_lines; i++) { of_node_put(pcdev->pi[i].pairset[0].np); of_node_put(pcdev->pi[i].pairset[1].np); of_node_put(pcdev->pi[i].np); @@ -647,7 +647,7 @@ static int of_pse_match_pi(struct pse_controller_dev *pcdev, { int i; - for (i = 0; i <= pcdev->nr_lines; i++) { + for (i = 0; i < pcdev->nr_lines; i++) { if (pcdev->pi[i].np == np) return i; } From 2cb3f56e827abb22c4168ad0c1bbbf401bb2f3b8 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 15 Oct 2024 22:41:48 +0800 Subject: [PATCH 08/40] net/sun3_82586: fix potential memory leak in sun3_82586_send_packet() The sun3_82586_send_packet() returns NETDEV_TX_OK without freeing skb in case of skb->len being too long, add dev_kfree_skb() to fix it. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Wang Hai Reviewed-by: Simon Horman Message-ID: <20241015144148.7918-1-wanghai38@huawei.com> Signed-off-by: Andrew Lunn --- drivers/net/ethernet/i825xx/sun3_82586.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c index f2d4669c81cf..58a3d28d938c 100644 --- a/drivers/net/ethernet/i825xx/sun3_82586.c +++ b/drivers/net/ethernet/i825xx/sun3_82586.c @@ -1012,6 +1012,7 @@ sun3_82586_send_packet(struct sk_buff *skb, struct net_device *dev) if(skb->len > XMIT_BUFF_SIZE) { printk("%s: Sorry, max. framelength is %d bytes. The length of your frame is %d bytes.\n",dev->name,XMIT_BUFF_SIZE,skb->len); + dev_kfree_skb(skb); return NETDEV_TX_OK; } From e4dd8bfe0f6a23acd305f9b892c00899089bd621 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 15 Oct 2024 22:48:02 +0800 Subject: [PATCH 09/40] be2net: fix potential memory leak in be_xmit() The be_xmit() returns NETDEV_TX_OK without freeing skb in case of be_xmit_enqueue() fails, add dev_kfree_skb_any() to fix it. Fixes: 760c295e0e8d ("be2net: Support for OS2BMC.") Signed-off-by: Wang Hai Reviewed-by: Simon Horman Reviewed-by: Kalesh AP Message-ID: <20241015144802.12150-1-wanghai38@huawei.com> Signed-off-by: Andrew Lunn --- drivers/net/ethernet/emulex/benet/be_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index a8596ebcdfd6..875fe379eea2 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -1381,10 +1381,8 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) be_get_wrb_params_from_skb(adapter, skb, &wrb_params); wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params); - if (unlikely(!wrb_cnt)) { - dev_kfree_skb_any(skb); - goto drop; - } + if (unlikely(!wrb_cnt)) + goto drop_skb; /* if os2bmc is enabled and if the pkt is destined to bmc, * enqueue the pkt a 2nd time with mgmt bit set. @@ -1393,7 +1391,7 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) BE_WRB_F_SET(wrb_params.features, OS2BMC, 1); wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params); if (unlikely(!wrb_cnt)) - goto drop; + goto drop_skb; else skb_get(skb); } @@ -1407,6 +1405,8 @@ static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev) be_xmit_flush(adapter, txo); return NETDEV_TX_OK; +drop_skb: + dev_kfree_skb_any(skb); drop: tx_stats(txo)->tx_drv_drops++; /* Flush the already enqueued tx requests */ From f99cf996ba5a315f8b9f13cc21dff0604a0eb749 Mon Sep 17 00:00:00 2001 From: Jakub Boehm Date: Tue, 15 Oct 2024 17:16:04 +0200 Subject: [PATCH 10/40] net: plip: fix break; causing plip to never transmit Since commit 71ae2cb30531 ("net: plip: Fix fall-through warnings for Clang") plip was not able to send any packets, this patch replaces one unintended break; with fallthrough; which was originally missed by commit 9525d69a3667 ("net: plip: mark expected switch fall-throughs"). I have verified with a real hardware PLIP connection that everything works once again after applying this patch. Fixes: 71ae2cb30531 ("net: plip: Fix fall-through warnings for Clang") Signed-off-by: Jakub Boehm Reviewed-by: Simon Horman Message-ID: <20241015-net-plip-tx-fix-v1-1-32d8be1c7e0b@gmail.com> Signed-off-by: Andrew Lunn --- drivers/net/plip/plip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c index e39bfaefe8c5..d81163bc910a 100644 --- a/drivers/net/plip/plip.c +++ b/drivers/net/plip/plip.c @@ -815,7 +815,7 @@ plip_send_packet(struct net_device *dev, struct net_local *nl, return HS_TIMEOUT; } } - break; + fallthrough; case PLIP_PK_LENGTH_LSB: if (plip_send(nibble_timeout, dev, From 9f86df0e7537c31b901b7448cf1c30f4fc1afc41 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Oct 2024 08:30:05 -0700 Subject: [PATCH 11/40] MAINTAINERS: add Simon as an official reviewer Simon has been diligently and consistently reviewing networking changes for at least as long as our development statistics go back. Often if not usually topping the list of reviewers. Make his role official. Acked-by: Eric Dumazet Acked-by: Paolo Abeni Signed-off-by: Jakub Kicinski Reviewed-by: Simon Horman Message-ID: <20241015153005.2854018-1-kuba@kernel.org> Signed-off-by: Andrew Lunn --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 497fdbe537de..86f64a2c7e1b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16152,6 +16152,7 @@ M: "David S. Miller" M: Eric Dumazet M: Jakub Kicinski M: Paolo Abeni +R: Simon Horman L: netdev@vger.kernel.org S: Maintained P: Documentation/process/maintainer-netdev.rst From de96f6a3003513c796bbe4e23210a446913f5c00 Mon Sep 17 00:00:00 2001 From: Michel Alex Date: Wed, 16 Oct 2024 12:11:15 +0000 Subject: [PATCH 12/40] net: phy: dp83822: Fix reset pin definitions This change fixes a rare issue where the PHY fails to detect a link due to incorrect reset behavior. The SW_RESET definition was incorrectly assigned to bit 14, which is the Digital Restart bit according to the datasheet. This commit corrects SW_RESET to bit 15 and assigns DIG_RESTART to bit 14 as per the datasheet specifications. The SW_RESET define is only used in the phy_reset function, which fully re-initializes the PHY after the reset is performed. The change in the bit definitions should not have any negative impact on the functionality of the PHY. v2: - added Fixes tag - improved commit message Cc: stable@vger.kernel.org Fixes: 5dc39fd5ef35 ("net: phy: DP83822: Add ability to advertise Fiber connection") Signed-off-by: Alex Michel Reviewed-by: Andrew Lunn Message-ID: Signed-off-by: Andrew Lunn --- drivers/net/phy/dp83822.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index fc247f479257..3ab64e04a01c 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -45,8 +45,8 @@ /* Control Register 2 bits */ #define DP83822_FX_ENABLE BIT(14) -#define DP83822_HW_RESET BIT(15) -#define DP83822_SW_RESET BIT(14) +#define DP83822_SW_RESET BIT(15) +#define DP83822_DIG_RESTART BIT(14) /* PHY STS bits */ #define DP83822_PHYSTS_DUPLEX BIT(2) From 4ab3e4983bcc9d9b9dd9720253cb93f44e9e657c Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Wed, 16 Oct 2024 12:52:34 -0700 Subject: [PATCH 13/40] bnxt_en: replace ptp_lock with irqsave variant In netpoll configuration the completion processing can happen in hard irq context which will break with spin_lock_bh() for fullfilling RX timestamp in case of all packets timestamping. Replace it with spin_lock_irqsave() variant. Fixes: 7f5515d19cd7 ("bnxt_en: Get the RX packet timestamp") Reviewed-by: Michael Chan Signed-off-by: Vadim Fedorenko Message-ID: <20241016195234.2622004-1-vadfed@meta.com> Signed-off-by: Andrew Lunn --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 22 +++--- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 70 +++++++++++-------- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 12 ++-- 3 files changed, 63 insertions(+), 41 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 6e422e24750a..99d025b69079 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2254,10 +2254,11 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, if (!bnxt_get_rx_ts_p5(bp, &ts, cmpl_ts)) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; + unsigned long flags; - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); ns = timecounter_cyc2time(&ptp->tc, ts); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); memset(skb_hwtstamps(skb), 0, sizeof(*skb_hwtstamps(skb))); skb_hwtstamps(skb)->hwtstamp = ns_to_ktime(ns); @@ -2757,17 +2758,18 @@ static int bnxt_async_event_process(struct bnxt *bp, case ASYNC_EVENT_CMPL_PHC_UPDATE_EVENT_DATA1_FLAGS_PHC_RTC_UPDATE: if (BNXT_PTP_USE_RTC(bp)) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; + unsigned long flags; u64 ns; if (!ptp) goto async_event_process_exit; - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); bnxt_ptp_update_current_time(bp); ns = (((u64)BNXT_EVENT_PHC_RTC_UPDATE(data1) << BNXT_PHC_BITS) | ptp->current_time); bnxt_ptp_rtc_timecounter_init(ptp, ns); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); } break; } @@ -13494,9 +13496,11 @@ static void bnxt_force_fw_reset(struct bnxt *bp) return; if (ptp) { - spin_lock_bh(&ptp->ptp_lock); + unsigned long flags; + + spin_lock_irqsave(&ptp->ptp_lock, flags); set_bit(BNXT_STATE_IN_FW_RESET, &bp->state); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); } else { set_bit(BNXT_STATE_IN_FW_RESET, &bp->state); } @@ -13561,9 +13565,11 @@ void bnxt_fw_reset(struct bnxt *bp) int n = 0, tmo; if (ptp) { - spin_lock_bh(&ptp->ptp_lock); + unsigned long flags; + + spin_lock_irqsave(&ptp->ptp_lock, flags); set_bit(BNXT_STATE_IN_FW_RESET, &bp->state); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); } else { set_bit(BNXT_STATE_IN_FW_RESET, &bp->state); } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index 37d42423459c..fa514be87650 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -62,13 +62,14 @@ static int bnxt_ptp_settime(struct ptp_clock_info *ptp_info, struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg, ptp_info); u64 ns = timespec64_to_ns(ts); + unsigned long flags; if (BNXT_PTP_USE_RTC(ptp->bp)) return bnxt_ptp_cfg_settime(ptp->bp, ns); - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); timecounter_init(&ptp->tc, &ptp->cc, ns); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); return 0; } @@ -100,13 +101,14 @@ static int bnxt_refclk_read(struct bnxt *bp, struct ptp_system_timestamp *sts, static void bnxt_ptp_get_current_time(struct bnxt *bp) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; + unsigned long flags; if (!ptp) return; - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); WRITE_ONCE(ptp->old_time, ptp->current_time); bnxt_refclk_read(bp, NULL, &ptp->current_time); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); } static int bnxt_hwrm_port_ts_query(struct bnxt *bp, u32 flags, u64 *ts, @@ -149,17 +151,18 @@ static int bnxt_ptp_gettimex(struct ptp_clock_info *ptp_info, { struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg, ptp_info); + unsigned long flags; u64 ns, cycles; int rc; - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); rc = bnxt_refclk_read(ptp->bp, sts, &cycles); if (rc) { - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); return rc; } ns = timecounter_cyc2time(&ptp->tc, cycles); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); *ts = ns_to_timespec64(ns); return 0; @@ -177,6 +180,7 @@ void bnxt_ptp_update_current_time(struct bnxt *bp) static int bnxt_ptp_adjphc(struct bnxt_ptp_cfg *ptp, s64 delta) { struct hwrm_port_mac_cfg_input *req; + unsigned long flags; int rc; rc = hwrm_req_init(ptp->bp, req, HWRM_PORT_MAC_CFG); @@ -190,9 +194,9 @@ static int bnxt_ptp_adjphc(struct bnxt_ptp_cfg *ptp, s64 delta) if (rc) { netdev_err(ptp->bp->dev, "ptp adjphc failed. rc = %x\n", rc); } else { - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); bnxt_ptp_update_current_time(ptp->bp); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); } return rc; @@ -202,13 +206,14 @@ static int bnxt_ptp_adjtime(struct ptp_clock_info *ptp_info, s64 delta) { struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg, ptp_info); + unsigned long flags; if (BNXT_PTP_USE_RTC(ptp->bp)) return bnxt_ptp_adjphc(ptp, delta); - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); timecounter_adjtime(&ptp->tc, delta); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); return 0; } @@ -236,14 +241,15 @@ static int bnxt_ptp_adjfine(struct ptp_clock_info *ptp_info, long scaled_ppm) struct bnxt_ptp_cfg *ptp = container_of(ptp_info, struct bnxt_ptp_cfg, ptp_info); struct bnxt *bp = ptp->bp; + unsigned long flags; if (!BNXT_MH(bp)) return bnxt_ptp_adjfine_rtc(bp, scaled_ppm); - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); timecounter_read(&ptp->tc); ptp->cc.mult = adjust_by_scaled_ppm(ptp->cmult, scaled_ppm); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); return 0; } @@ -251,12 +257,13 @@ void bnxt_ptp_pps_event(struct bnxt *bp, u32 data1, u32 data2) { struct bnxt_ptp_cfg *ptp = bp->ptp_cfg; struct ptp_clock_event event; + unsigned long flags; u64 ns, pps_ts; pps_ts = EVENT_PPS_TS(data2, data1); - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); ns = timecounter_cyc2time(&ptp->tc, pps_ts); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); switch (EVENT_DATA2_PPS_EVENT_TYPE(data2)) { case ASYNC_EVENT_CMPL_PPS_TIMESTAMP_EVENT_DATA2_EVENT_TYPE_INTERNAL: @@ -393,16 +400,17 @@ static int bnxt_get_target_cycles(struct bnxt_ptp_cfg *ptp, u64 target_ns, { u64 cycles_now; u64 nsec_now, nsec_delta; + unsigned long flags; int rc; - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); rc = bnxt_refclk_read(ptp->bp, NULL, &cycles_now); if (rc) { - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); return rc; } nsec_now = timecounter_cyc2time(&ptp->tc, cycles_now); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); nsec_delta = target_ns - nsec_now; *cycles_delta = div64_u64(nsec_delta << ptp->cc.shift, ptp->cc.mult); @@ -689,6 +697,7 @@ static int bnxt_stamp_tx_skb(struct bnxt *bp, int slot) struct skb_shared_hwtstamps timestamp; struct bnxt_ptp_tx_req *txts_req; unsigned long now = jiffies; + unsigned long flags; u64 ts = 0, ns = 0; u32 tmo = 0; int rc; @@ -702,9 +711,9 @@ static int bnxt_stamp_tx_skb(struct bnxt *bp, int slot) tmo, slot); if (!rc) { memset(×tamp, 0, sizeof(timestamp)); - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); ns = timecounter_cyc2time(&ptp->tc, ts); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); timestamp.hwtstamp = ns_to_ktime(ns); skb_tstamp_tx(txts_req->tx_skb, ×tamp); ptp->stats.ts_pkts++; @@ -730,6 +739,7 @@ static long bnxt_ptp_ts_aux_work(struct ptp_clock_info *ptp_info) unsigned long now = jiffies; struct bnxt *bp = ptp->bp; u16 cons = ptp->txts_cons; + unsigned long flags; u32 num_requests; int rc = 0; @@ -757,9 +767,9 @@ static long bnxt_ptp_ts_aux_work(struct ptp_clock_info *ptp_info) bnxt_ptp_get_current_time(bp); ptp->next_period = now + HZ; if (time_after_eq(now, ptp->next_overflow_check)) { - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); timecounter_read(&ptp->tc); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); ptp->next_overflow_check = now + BNXT_PHC_OVERFLOW_PERIOD; } if (rc == -EAGAIN) @@ -819,6 +829,7 @@ void bnxt_tx_ts_cmp(struct bnxt *bp, struct bnxt_napi *bnapi, u32 opaque = tscmp->tx_ts_cmp_opaque; struct bnxt_tx_ring_info *txr; struct bnxt_sw_tx_bd *tx_buf; + unsigned long flags; u64 ts, ns; u16 cons; @@ -833,9 +844,9 @@ void bnxt_tx_ts_cmp(struct bnxt *bp, struct bnxt_napi *bnapi, le32_to_cpu(tscmp->tx_ts_cmp_flags_type), le32_to_cpu(tscmp->tx_ts_cmp_errors_v)); } else { - spin_lock_bh(&ptp->ptp_lock); + spin_lock_irqsave(&ptp->ptp_lock, flags); ns = timecounter_cyc2time(&ptp->tc, ts); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); timestamp.hwtstamp = ns_to_ktime(ns); skb_tstamp_tx(tx_buf->skb, ×tamp); } @@ -975,6 +986,7 @@ void bnxt_ptp_rtc_timecounter_init(struct bnxt_ptp_cfg *ptp, u64 ns) int bnxt_ptp_init_rtc(struct bnxt *bp, bool phc_cfg) { struct timespec64 tsp; + unsigned long flags; u64 ns; int rc; @@ -993,9 +1005,9 @@ int bnxt_ptp_init_rtc(struct bnxt *bp, bool phc_cfg) if (rc) return rc; } - spin_lock_bh(&bp->ptp_cfg->ptp_lock); + spin_lock_irqsave(&bp->ptp_cfg->ptp_lock, flags); bnxt_ptp_rtc_timecounter_init(bp->ptp_cfg, ns); - spin_unlock_bh(&bp->ptp_cfg->ptp_lock); + spin_unlock_irqrestore(&bp->ptp_cfg->ptp_lock, flags); return 0; } @@ -1063,10 +1075,12 @@ int bnxt_ptp_init(struct bnxt *bp, bool phc_cfg) atomic64_set(&ptp->stats.ts_err, 0); if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) { - spin_lock_bh(&ptp->ptp_lock); + unsigned long flags; + + spin_lock_irqsave(&ptp->ptp_lock, flags); bnxt_refclk_read(bp, NULL, &ptp->current_time); WRITE_ONCE(ptp->old_time, ptp->current_time); - spin_unlock_bh(&ptp->ptp_lock); + spin_unlock_irqrestore(&ptp->ptp_lock, flags); ptp_schedule_worker(ptp->ptp_clock, 0); } ptp->txts_tmo = BNXT_PTP_DFLT_TX_TMO; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index a9a2f9a18c9c..f322466ecad3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -146,11 +146,13 @@ struct bnxt_ptp_cfg { }; #if BITS_PER_LONG == 32 -#define BNXT_READ_TIME64(ptp, dst, src) \ -do { \ - spin_lock_bh(&(ptp)->ptp_lock); \ - (dst) = (src); \ - spin_unlock_bh(&(ptp)->ptp_lock); \ +#define BNXT_READ_TIME64(ptp, dst, src) \ +do { \ + unsigned long flags; \ + \ + spin_lock_irqsave(&(ptp)->ptp_lock, flags); \ + (dst) = (src); \ + spin_unlock_irqrestore(&(ptp)->ptp_lock, flags); \ } while (0) #else #define BNXT_READ_TIME64(ptp, dst, src) \ From bd28df26197b2bd0913bf1b36770836481975143 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Thu, 17 Oct 2024 13:06:50 +0300 Subject: [PATCH 14/40] octeon_ep: Implement helper for iterating packets in Rx queue The common code with some packet and index manipulations is extracted and moved to newly implemented helper to make the code more readable and avoid duplication. This is a preparation for skb allocation failure handling. Found by Linux Verification Center (linuxtesting.org) with SVACE. Suggested-by: Simon Horman Suggested-by: Paolo Abeni Signed-off-by: Aleksandr Mishin Reviewed-by: Jacob Keller Signed-off-by: Andrew Lunn --- .../net/ethernet/marvell/octeon_ep/octep_rx.c | 55 +++++++++++-------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c index 4746a6b258f0..a889c1510518 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c @@ -336,6 +336,30 @@ static int octep_oq_check_hw_for_pkts(struct octep_device *oct, return new_pkts; } +/** + * octep_oq_next_pkt() - Move to the next packet in Rx queue. + * + * @oq: Octeon Rx queue data structure. + * @buff_info: Current packet buffer info. + * @read_idx: Current packet index in the ring. + * @desc_used: Current packet descriptor number. + * + * Free the resources associated with a packet. + * Increment packet index in the ring and packet descriptor number. + */ +static void octep_oq_next_pkt(struct octep_oq *oq, + struct octep_rx_buffer *buff_info, + u32 *read_idx, u32 *desc_used) +{ + dma_unmap_page(oq->dev, oq->desc_ring[*read_idx].buffer_ptr, + PAGE_SIZE, DMA_FROM_DEVICE); + buff_info->page = NULL; + (*read_idx)++; + (*desc_used)++; + if (*read_idx == oq->max_count) + *read_idx = 0; +} + /** * __octep_oq_process_rx() - Process hardware Rx queue and push to stack. * @@ -367,10 +391,7 @@ static int __octep_oq_process_rx(struct octep_device *oct, desc_used = 0; for (pkt = 0; pkt < pkts_to_process; pkt++) { buff_info = (struct octep_rx_buffer *)&oq->buff_info[read_idx]; - dma_unmap_page(oq->dev, oq->desc_ring[read_idx].buffer_ptr, - PAGE_SIZE, DMA_FROM_DEVICE); resp_hw = page_address(buff_info->page); - buff_info->page = NULL; /* Swap the length field that is in Big-Endian to CPU */ buff_info->len = be64_to_cpu(resp_hw->length); @@ -394,36 +415,27 @@ static int __octep_oq_process_rx(struct octep_device *oct, data_offset = OCTEP_OQ_RESP_HW_SIZE; rx_ol_flags = 0; } + + octep_oq_next_pkt(oq, buff_info, &read_idx, &desc_used); + + skb = build_skb((void *)resp_hw, PAGE_SIZE); + skb_reserve(skb, data_offset); + rx_bytes += buff_info->len; if (buff_info->len <= oq->max_single_buffer_size) { - skb = build_skb((void *)resp_hw, PAGE_SIZE); - skb_reserve(skb, data_offset); skb_put(skb, buff_info->len); - read_idx++; - desc_used++; - if (read_idx == oq->max_count) - read_idx = 0; } else { struct skb_shared_info *shinfo; u16 data_len; - skb = build_skb((void *)resp_hw, PAGE_SIZE); - skb_reserve(skb, data_offset); /* Head fragment includes response header(s); * subsequent fragments contains only data. */ skb_put(skb, oq->max_single_buffer_size); - read_idx++; - desc_used++; - if (read_idx == oq->max_count) - read_idx = 0; - shinfo = skb_shinfo(skb); data_len = buff_info->len - oq->max_single_buffer_size; while (data_len) { - dma_unmap_page(oq->dev, oq->desc_ring[read_idx].buffer_ptr, - PAGE_SIZE, DMA_FROM_DEVICE); buff_info = (struct octep_rx_buffer *) &oq->buff_info[read_idx]; if (data_len < oq->buffer_size) { @@ -438,11 +450,8 @@ static int __octep_oq_process_rx(struct octep_device *oct, buff_info->page, 0, buff_info->len, buff_info->len); - buff_info->page = NULL; - read_idx++; - desc_used++; - if (read_idx == oq->max_count) - read_idx = 0; + + octep_oq_next_pkt(oq, buff_info, &read_idx, &desc_used); } } From eb592008f79be52ccef88cd9a5249b3fc0367278 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Thu, 17 Oct 2024 13:06:51 +0300 Subject: [PATCH 15/40] octeon_ep: Add SKB allocation failures handling in __octep_oq_process_rx() build_skb() returns NULL in case of a memory allocation failure so handle it inside __octep_oq_process_rx() to avoid NULL pointer dereference. __octep_oq_process_rx() is called during NAPI polling by the driver. If skb allocation fails, keep on pulling packets out of the Rx DMA queue: we shouldn't break the polling immediately and thus falsely indicate to the octep_napi_poll() that the Rx pressure is going down. As there is no associated skb in this case, don't process the packets and don't push them up the network stack - they are skipped. Helper function is implemented to unmmap/flush all the fragment buffers used by the dropped packet. 'alloc_failures' counter is incremented to mark the skb allocation error in driver statistics. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 37d79d059606 ("octeon_ep: add Tx/Rx processing and interrupt support") Suggested-by: Paolo Abeni Signed-off-by: Aleksandr Mishin Reviewed-by: Jacob Keller Signed-off-by: Andrew Lunn --- .../net/ethernet/marvell/octeon_ep/octep_rx.c | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c index a889c1510518..8af75cb37c3e 100644 --- a/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c +++ b/drivers/net/ethernet/marvell/octeon_ep/octep_rx.c @@ -360,6 +360,27 @@ static void octep_oq_next_pkt(struct octep_oq *oq, *read_idx = 0; } +/** + * octep_oq_drop_rx() - Free the resources associated with a packet. + * + * @oq: Octeon Rx queue data structure. + * @buff_info: Current packet buffer info. + * @read_idx: Current packet index in the ring. + * @desc_used: Current packet descriptor number. + * + */ +static void octep_oq_drop_rx(struct octep_oq *oq, + struct octep_rx_buffer *buff_info, + u32 *read_idx, u32 *desc_used) +{ + int data_len = buff_info->len - oq->max_single_buffer_size; + + while (data_len > 0) { + octep_oq_next_pkt(oq, buff_info, read_idx, desc_used); + data_len -= oq->buffer_size; + }; +} + /** * __octep_oq_process_rx() - Process hardware Rx queue and push to stack. * @@ -419,6 +440,12 @@ static int __octep_oq_process_rx(struct octep_device *oct, octep_oq_next_pkt(oq, buff_info, &read_idx, &desc_used); skb = build_skb((void *)resp_hw, PAGE_SIZE); + if (!skb) { + octep_oq_drop_rx(oq, buff_info, + &read_idx, &desc_used); + oq->stats.alloc_failures++; + continue; + } skb_reserve(skb, data_offset); rx_bytes += buff_info->len; From 12bc14949c4a7272b509af0f1022a0deeb215fd8 Mon Sep 17 00:00:00 2001 From: Peter Rashleigh Date: Tue, 15 Oct 2024 21:08:22 -0700 Subject: [PATCH 16/40] net: dsa: mv88e6xxx: Fix error when setting port policy on mv88e6393x mv88e6393x_port_set_policy doesn't correctly shift the ptr value when converting the policy format between the old and new styles, so the target register ends up with the ptr being written over the data bits. Shift the pointer to align with the format expected by mv88e6393x_port_policy_write(). Fixes: 6584b26020fc ("net: dsa: mv88e6xxx: implement .port_set_policy for Amethyst") Signed-off-by: Peter Rashleigh Reviewed-by: Simon Horman Message-ID: <20241016040822.3917-1-peter@rashleigh.ca> Signed-off-by: Andrew Lunn --- drivers/net/dsa/mv88e6xxx/port.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 5394a8cf7bf1..04053fdc6489 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -1713,6 +1713,7 @@ int mv88e6393x_port_set_policy(struct mv88e6xxx_chip *chip, int port, ptr = shift / 8; shift %= 8; mask >>= ptr * 8; + ptr <<= 8; err = mv88e6393x_port_policy_read(chip, port, ptr, ®); if (err) From 3e14d8ebaa11e92325985c67cd147431a1116cd4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 16 Oct 2024 16:56:13 +0200 Subject: [PATCH 17/40] mailmap: update entry for Jesper Dangaard Brouer Mapping all my previously used emails to my kernel.org email. Signed-off-by: Jesper Dangaard Brouer Message-ID: <172909057364.2452383.8019986488234344607.stgit@firesoul> Signed-off-by: Andrew Lunn --- .mailmap | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.mailmap b/.mailmap index 54580db9c59b..6877a258e010 100644 --- a/.mailmap +++ b/.mailmap @@ -304,6 +304,11 @@ Jens Axboe Jens Axboe Jens Osterkamp Jernej Skrabec +Jesper Dangaard Brouer +Jesper Dangaard Brouer +Jesper Dangaard Brouer +Jesper Dangaard Brouer +Jesper Dangaard Brouer Jessica Zhang Jilai Wang Jiri Kosina From 3b05b9c36ddd01338e1352588f2ec1ea23f97d43 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 18 Oct 2024 00:53:01 +0000 Subject: [PATCH 18/40] MAINTAINERS: add samples/pktgen to NETWORKING [GENERAL] samples/pktgen is missing in the MAINTAINERS file. Suggested-by: Antoine Tenart Reviewed-by: Simon Horman Signed-off-by: Hangbin Liu Message-ID: <20241018005301.10052-1-liuhangbin@gmail.com> Signed-off-by: Andrew Lunn --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 86f64a2c7e1b..057f085680ab 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16195,6 +16195,7 @@ F: include/uapi/linux/rtnetlink.h F: lib/net_utils.c F: lib/random32.c F: net/ +F: samples/pktgen/ F: tools/net/ F: tools/testing/selftests/net/ X: Documentation/networking/mac80211-injection.rst From efeddd552ec6767e4c8884caa516ac80b65f8823 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 15 Oct 2024 09:01:21 +0300 Subject: [PATCH 19/40] fsl/fman: Save device references taken in mac_probe() In mac_probe() there are calls to of_find_device_by_node() which takes references to of_dev->dev. These references are not saved and not released later on error path in mac_probe() and in mac_remove(). Add new fields into mac_device structure to save references taken for future use in mac_probe() and mac_remove(). This is a preparation for further reference leaks fix. Signed-off-by: Aleksandr Mishin Signed-off-by: Paolo Abeni --- drivers/net/ethernet/freescale/fman/mac.c | 6 ++++-- drivers/net/ethernet/freescale/fman/mac.h | 6 +++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c index 9767586b4eb3..9b863db0bf08 100644 --- a/drivers/net/ethernet/freescale/fman/mac.c +++ b/drivers/net/ethernet/freescale/fman/mac.c @@ -197,6 +197,7 @@ static int mac_probe(struct platform_device *_of_dev) err = -EINVAL; goto _return_of_node_put; } + mac_dev->fman_dev = &of_dev->dev; /* Get the FMan cell-index */ err = of_property_read_u32(dev_node, "cell-index", &val); @@ -208,7 +209,7 @@ static int mac_probe(struct platform_device *_of_dev) /* cell-index 0 => FMan id 1 */ fman_id = (u8)(val + 1); - priv->fman = fman_bind(&of_dev->dev); + priv->fman = fman_bind(mac_dev->fman_dev); if (!priv->fman) { dev_err(dev, "fman_bind(%pOF) failed\n", dev_node); err = -ENODEV; @@ -284,8 +285,9 @@ static int mac_probe(struct platform_device *_of_dev) err = -EINVAL; goto _return_of_node_put; } + mac_dev->fman_port_devs[i] = &of_dev->dev; - mac_dev->port[i] = fman_port_bind(&of_dev->dev); + mac_dev->port[i] = fman_port_bind(mac_dev->fman_port_devs[i]); if (!mac_dev->port[i]) { dev_err(dev, "dev_get_drvdata(%pOF) failed\n", dev_node); diff --git a/drivers/net/ethernet/freescale/fman/mac.h b/drivers/net/ethernet/freescale/fman/mac.h index fe747915cc73..8b5b43d50f8e 100644 --- a/drivers/net/ethernet/freescale/fman/mac.h +++ b/drivers/net/ethernet/freescale/fman/mac.h @@ -19,12 +19,13 @@ struct fman_mac; struct mac_priv_s; +#define PORT_NUM 2 struct mac_device { void __iomem *vaddr; struct device *dev; struct resource *res; u8 addr[ETH_ALEN]; - struct fman_port *port[2]; + struct fman_port *port[PORT_NUM]; struct phylink *phylink; struct phylink_config phylink_config; phy_interface_t phy_if; @@ -52,6 +53,9 @@ struct mac_device { struct fman_mac *fman_mac; struct mac_priv_s *priv; + + struct device *fman_dev; + struct device *fman_port_devs[PORT_NUM]; }; static inline struct mac_device From 1dec67e0d9fbb087c2ab17bf1bd17208231c3bb1 Mon Sep 17 00:00:00 2001 From: Aleksandr Mishin Date: Tue, 15 Oct 2024 09:01:22 +0300 Subject: [PATCH 20/40] fsl/fman: Fix refcount handling of fman-related devices In mac_probe() there are multiple calls to of_find_device_by_node(), fman_bind() and fman_port_bind() which takes references to of_dev->dev. Not all references taken by these calls are released later on error path in mac_probe() and in mac_remove() which lead to reference leaks. Add references release. Fixes: 3933961682a3 ("fsl/fman: Add FMan MAC driver") Signed-off-by: Aleksandr Mishin Signed-off-by: Paolo Abeni --- drivers/net/ethernet/freescale/fman/mac.c | 62 +++++++++++++++++------ 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c index 9b863db0bf08..11da139082e1 100644 --- a/drivers/net/ethernet/freescale/fman/mac.c +++ b/drivers/net/ethernet/freescale/fman/mac.c @@ -204,7 +204,7 @@ static int mac_probe(struct platform_device *_of_dev) if (err) { dev_err(dev, "failed to read cell-index for %pOF\n", dev_node); err = -EINVAL; - goto _return_of_node_put; + goto _return_dev_put; } /* cell-index 0 => FMan id 1 */ fman_id = (u8)(val + 1); @@ -213,40 +213,51 @@ static int mac_probe(struct platform_device *_of_dev) if (!priv->fman) { dev_err(dev, "fman_bind(%pOF) failed\n", dev_node); err = -ENODEV; - goto _return_of_node_put; + goto _return_dev_put; } + /* Two references have been taken in of_find_device_by_node() + * and fman_bind(). Release one of them here. The second one + * will be released in mac_remove(). + */ + put_device(mac_dev->fman_dev); of_node_put(dev_node); + dev_node = NULL; /* Get the address of the memory mapped registers */ mac_dev->res = platform_get_mem_or_io(_of_dev, 0); if (!mac_dev->res) { dev_err(dev, "could not get registers\n"); - return -EINVAL; + err = -EINVAL; + goto _return_dev_put; } err = devm_request_resource(dev, fman_get_mem_region(priv->fman), mac_dev->res); if (err) { dev_err_probe(dev, err, "could not request resource\n"); - return err; + goto _return_dev_put; } mac_dev->vaddr = devm_ioremap(dev, mac_dev->res->start, resource_size(mac_dev->res)); if (!mac_dev->vaddr) { dev_err(dev, "devm_ioremap() failed\n"); - return -EIO; + err = -EIO; + goto _return_dev_put; } - if (!of_device_is_available(mac_node)) - return -ENODEV; + if (!of_device_is_available(mac_node)) { + err = -ENODEV; + goto _return_dev_put; + } /* Get the cell-index */ err = of_property_read_u32(mac_node, "cell-index", &val); if (err) { dev_err(dev, "failed to read cell-index for %pOF\n", mac_node); - return -EINVAL; + err = -EINVAL; + goto _return_dev_put; } priv->cell_index = (u8)val; @@ -260,22 +271,26 @@ static int mac_probe(struct platform_device *_of_dev) if (unlikely(nph < 0)) { dev_err(dev, "of_count_phandle_with_args(%pOF, fsl,fman-ports) failed\n", mac_node); - return nph; + err = nph; + goto _return_dev_put; } if (nph != ARRAY_SIZE(mac_dev->port)) { dev_err(dev, "Not supported number of fman-ports handles of mac node %pOF from device tree\n", mac_node); - return -EINVAL; + err = -EINVAL; + goto _return_dev_put; } - for (i = 0; i < ARRAY_SIZE(mac_dev->port); i++) { + /* PORT_NUM determines the size of the port array */ + for (i = 0; i < PORT_NUM; i++) { /* Find the port node */ dev_node = of_parse_phandle(mac_node, "fsl,fman-ports", i); if (!dev_node) { dev_err(dev, "of_parse_phandle(%pOF, fsl,fman-ports) failed\n", mac_node); - return -EINVAL; + err = -EINVAL; + goto _return_dev_arr_put; } of_dev = of_find_device_by_node(dev_node); @@ -283,7 +298,7 @@ static int mac_probe(struct platform_device *_of_dev) dev_err(dev, "of_find_device_by_node(%pOF) failed\n", dev_node); err = -EINVAL; - goto _return_of_node_put; + goto _return_dev_arr_put; } mac_dev->fman_port_devs[i] = &of_dev->dev; @@ -292,9 +307,15 @@ static int mac_probe(struct platform_device *_of_dev) dev_err(dev, "dev_get_drvdata(%pOF) failed\n", dev_node); err = -EINVAL; - goto _return_of_node_put; + goto _return_dev_arr_put; } + /* Two references have been taken in of_find_device_by_node() + * and fman_port_bind(). Release one of them here. The second + * one will be released in mac_remove(). + */ + put_device(mac_dev->fman_port_devs[i]); of_node_put(dev_node); + dev_node = NULL; } /* Get the PHY connection type */ @@ -314,7 +335,7 @@ static int mac_probe(struct platform_device *_of_dev) err = init(mac_dev, mac_node, ¶ms); if (err < 0) - return err; + goto _return_dev_arr_put; if (!is_zero_ether_addr(mac_dev->addr)) dev_info(dev, "FMan MAC address: %pM\n", mac_dev->addr); @@ -329,6 +350,12 @@ static int mac_probe(struct platform_device *_of_dev) return err; +_return_dev_arr_put: + /* mac_dev is kzalloc'ed */ + for (i = 0; i < PORT_NUM; i++) + put_device(mac_dev->fman_port_devs[i]); +_return_dev_put: + put_device(mac_dev->fman_dev); _return_of_node_put: of_node_put(dev_node); return err; @@ -337,6 +364,11 @@ static int mac_probe(struct platform_device *_of_dev) static void mac_remove(struct platform_device *pdev) { struct mac_device *mac_dev = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < PORT_NUM; i++) + put_device(mac_dev->fman_port_devs[i]); + put_device(mac_dev->fman_dev); platform_device_unregister(mac_dev->priv->eth_dev); } From 306ed1728e8438caed30332e1ab46b28c25fe3d8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 20 Oct 2024 14:49:51 +0200 Subject: [PATCH 21/40] netfilter: xtables: fix typo causing some targets not to load on IPv6 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - There is no NFPROTO_IPV6 family for mark and NFLOG. - TRACE is also missing module autoload with NFPROTO_IPV6. This results in ip6tables failing to restore a ruleset. This issue has been reported by several users providing incomplete patches. Very similar to Ilya Katsnelson's patch including a missing chunk in the TRACE extension. Fixes: 0bfcb7b71e73 ("netfilter: xtables: avoid NFPROTO_UNSPEC where needed") Reported-by: Ignat Korchagin Reported-by: Ilya Katsnelson Reported-by: Krzysztof Olędzki Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_NFLOG.c | 2 +- net/netfilter/xt_TRACE.c | 1 + net/netfilter/xt_mark.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index d80abd6ccaf8..6dcf4bc7e30b 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -79,7 +79,7 @@ static struct xt_target nflog_tg_reg[] __read_mostly = { { .name = "NFLOG", .revision = 0, - .family = NFPROTO_IPV4, + .family = NFPROTO_IPV6, .checkentry = nflog_tg_check, .destroy = nflog_tg_destroy, .target = nflog_tg, diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index f3fa4f11348c..a642ff09fc8e 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -49,6 +49,7 @@ static struct xt_target trace_tg_reg[] __read_mostly = { .target = trace_tg, .checkentry = trace_tg_check, .destroy = trace_tg_destroy, + .me = THIS_MODULE, }, #endif }; diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index f76fe04fc9a4..65b965ca40ea 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -62,7 +62,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .revision = 2, - .family = NFPROTO_IPV4, + .family = NFPROTO_IPV6, .target = mark_tg, .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, From 47dd5447cab8ce30a847a0337d5341ae4c7476a7 Mon Sep 17 00:00:00 2001 From: Lin Ma Date: Tue, 15 Oct 2024 21:16:21 +0800 Subject: [PATCH 22/40] net: wwan: fix global oob in wwan_rtnl_policy The variable wwan_rtnl_link_ops assign a *bigger* maxtype which leads to a global out-of-bounds read when parsing the netlink attributes. Exactly same bug cause as the oob fixed in commit b33fb5b801c6 ("net: qualcomm: rmnet: fix global oob in rmnet_policy"). ================================================================== BUG: KASAN: global-out-of-bounds in validate_nla lib/nlattr.c:388 [inline] BUG: KASAN: global-out-of-bounds in __nla_validate_parse+0x19d7/0x29a0 lib/nlattr.c:603 Read of size 1 at addr ffffffff8b09cb60 by task syz.1.66276/323862 CPU: 0 PID: 323862 Comm: syz.1.66276 Not tainted 6.1.70 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x177/0x231 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x14f/0x750 mm/kasan/report.c:395 kasan_report+0x139/0x170 mm/kasan/report.c:495 validate_nla lib/nlattr.c:388 [inline] __nla_validate_parse+0x19d7/0x29a0 lib/nlattr.c:603 __nla_parse+0x3c/0x50 lib/nlattr.c:700 nla_parse_nested_deprecated include/net/netlink.h:1269 [inline] __rtnl_newlink net/core/rtnetlink.c:3514 [inline] rtnl_newlink+0x7bc/0x1fd0 net/core/rtnetlink.c:3623 rtnetlink_rcv_msg+0x794/0xef0 net/core/rtnetlink.c:6122 netlink_rcv_skb+0x1de/0x420 net/netlink/af_netlink.c:2508 netlink_unicast_kernel net/netlink/af_netlink.c:1326 [inline] netlink_unicast+0x74b/0x8c0 net/netlink/af_netlink.c:1352 netlink_sendmsg+0x882/0xb90 net/netlink/af_netlink.c:1874 sock_sendmsg_nosec net/socket.c:716 [inline] __sock_sendmsg net/socket.c:728 [inline] ____sys_sendmsg+0x5cc/0x8f0 net/socket.c:2499 ___sys_sendmsg+0x21c/0x290 net/socket.c:2553 __sys_sendmsg net/socket.c:2582 [inline] __do_sys_sendmsg net/socket.c:2591 [inline] __se_sys_sendmsg+0x19e/0x270 net/socket.c:2589 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x45/0x90 arch/x86/entry/common.c:81 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x7f67b19a24ad RSP: 002b:00007f67b17febb8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00007f67b1b45f80 RCX: 00007f67b19a24ad RDX: 0000000000000000 RSI: 0000000020005e40 RDI: 0000000000000004 RBP: 00007f67b1a1e01d R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffd2513764f R14: 00007ffd251376e0 R15: 00007f67b17fed40 The buggy address belongs to the variable: wwan_rtnl_policy+0x20/0x40 The buggy address belongs to the physical page: page:ffffea00002c2700 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0xb09c flags: 0xfff00000001000(reserved|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000001000 ffffea00002c2708 ffffea00002c2708 0000000000000000 raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner info is not present (never set?) Memory state around the buggy address: ffffffff8b09ca00: 05 f9 f9 f9 05 f9 f9 f9 00 01 f9 f9 00 01 f9 f9 ffffffff8b09ca80: 00 00 00 05 f9 f9 f9 f9 00 00 03 f9 f9 f9 f9 f9 >ffffffff8b09cb00: 00 00 00 00 05 f9 f9 f9 00 00 00 00 f9 f9 f9 f9 ^ ffffffff8b09cb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ================================================================== According to the comment of `nla_parse_nested_deprecated`, use correct size `IFLA_WWAN_MAX` here to fix this issue. Fixes: 88b710532e53 ("wwan: add interface creation support") Signed-off-by: Lin Ma Reviewed-by: Loic Poulain Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241015131621.47503-1-linma@zju.edu.cn Signed-off-by: Paolo Abeni --- drivers/net/wwan/wwan_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 17431f1b1a0c..65a7ed4d6766 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -1038,7 +1038,7 @@ static const struct nla_policy wwan_rtnl_policy[IFLA_WWAN_MAX + 1] = { static struct rtnl_link_ops wwan_rtnl_link_ops __read_mostly = { .kind = "wwan", - .maxtype = __IFLA_WWAN_MAX, + .maxtype = IFLA_WWAN_MAX, .alloc = wwan_rtnl_alloc, .validate = wwan_rtnl_validate, .newlink = wwan_rtnl_newlink, From 95ecba62e2fd201bcdcca636f5d774f1cd4f1458 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Oct 2024 19:41:18 +0000 Subject: [PATCH 23/40] net: fix races in netdev_tx_sent_queue()/dev_watchdog() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some workloads hit the infamous dev_watchdog() message: "NETDEV WATCHDOG: eth0 (xxxx): transmit queue XX timed out" It seems possible to hit this even for perfectly normal BQL enabled drivers: 1) Assume a TX queue was idle for more than dev->watchdog_timeo (5 seconds unless changed by the driver) 2) Assume a big packet is sent, exceeding current BQL limit. 3) Driver ndo_start_xmit() puts the packet in TX ring, and netdev_tx_sent_queue() is called. 4) QUEUE_STATE_STACK_XOFF could be set from netdev_tx_sent_queue() before txq->trans_start has been written. 5) txq->trans_start is written later, from netdev_start_xmit() if (rc == NETDEV_TX_OK) txq_trans_update(txq) dev_watchdog() running on another cpu could read the old txq->trans_start, and then see QUEUE_STATE_STACK_XOFF, because 5) did not happen yet. To solve the issue, write txq->trans_start right before one XOFF bit is set : - _QUEUE_STATE_DRV_XOFF from netif_tx_stop_queue() - __QUEUE_STATE_STACK_XOFF from netdev_tx_sent_queue() From dev_watchdog(), we have to read txq->state before txq->trans_start. Add memory barriers to enforce correct ordering. In the future, we could avoid writing over txq->trans_start for normal operations, and rename this field to txq->xoff_start_time. Fixes: bec251bc8b6a ("net: no longer stop all TX queues in dev_watchdog()") Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Reviewed-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241015194118.3951657-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 12 ++++++++++++ net/sched/sch_generic.c | 8 +++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4d20c776a4ff..8896705ccd63 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3325,6 +3325,12 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev) static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue) { + /* Paired with READ_ONCE() from dev_watchdog() */ + WRITE_ONCE(dev_queue->trans_start, jiffies); + + /* This barrier is paired with smp_mb() from dev_watchdog() */ + smp_mb__before_atomic(); + /* Must be an atomic op see netif_txq_try_stop() */ set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state); } @@ -3451,6 +3457,12 @@ static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, if (likely(dql_avail(&dev_queue->dql) >= 0)) return; + /* Paired with READ_ONCE() from dev_watchdog() */ + WRITE_ONCE(dev_queue->trans_start, jiffies); + + /* This barrier is paired with smp_mb() from dev_watchdog() */ + smp_mb__before_atomic(); + set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state); /* diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2af24547a82c..38ec18f73de4 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -512,9 +512,15 @@ static void dev_watchdog(struct timer_list *t) struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); - trans_start = READ_ONCE(txq->trans_start); if (!netif_xmit_stopped(txq)) continue; + + /* Paired with WRITE_ONCE() + smp_mb...() in + * netdev_tx_sent_queue() and netif_tx_stop_queue(). + */ + smp_mb(); + trans_start = READ_ONCE(txq->trans_start); + if (time_after(jiffies, trans_start + dev->watchdog_timeo)) { timedout_ms = jiffies_to_msecs(jiffies - trans_start); atomic_long_inc(&txq->trans_timeout); From d95d9a31aceb2021084bc9b94647bc5b175e05e7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Wed, 16 Oct 2024 13:27:07 -0400 Subject: [PATCH 24/40] virtio_net: fix integer overflow in stats Static analysis on linux-next has detected the following issue in function virtnet_stats_ctx_init, in drivers/net/virtio_net.c : if (vi->device_stats_cap & VIRTIO_NET_STATS_TYPE_CVQ) { queue_type = VIRTNET_Q_TYPE_CQ; ctx->bitmap[queue_type] |= VIRTIO_NET_STATS_TYPE_CVQ; ctx->desc_num[queue_type] += ARRAY_SIZE(virtnet_stats_cvq_desc); ctx->size[queue_type] += sizeof(struct virtio_net_stats_cvq); } ctx->bitmap is declared as a u32 however it is being bit-wise or'd with VIRTIO_NET_STATS_TYPE_CVQ and this is defined as 1 << 32: include/uapi/linux/virtio_net.h:#define VIRTIO_NET_STATS_TYPE_CVQ (1ULL << 32) ..and hence the bit-wise or operation won't set any bits in ctx->bitmap because 1ULL < 32 is too wide for a u32. In fact, the field is read into a u64: u64 offset, bitmap; .... bitmap = ctx->bitmap[queue_type]; so to fix, it is enough to make bitmap an array of u64. Fixes: 941168f8b40e5 ("virtio_net: support device stats") Reported-by: "Colin King (gmail)" Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Reviewed-by: Stefano Garzarella Link: https://patch.msgid.link/53e2bd6728136d5916e384a7840e5dc7eebff832.1729099611.git.mst@redhat.com Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index f8131f92a392..792e9eadbfc3 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -4155,7 +4155,7 @@ struct virtnet_stats_ctx { u32 desc_num[3]; /* The actual supported stat types. */ - u32 bitmap[3]; + u64 bitmap[3]; /* Used to calculate the reply buffer size. */ u32 size[3]; From f7b4cf0306bbea500a613e4b618576452c1df4ba Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Thu, 17 Oct 2024 10:32:23 +0800 Subject: [PATCH 25/40] mlxsw: spectrum_router: fix xa_store() error checking It is meant to use xa_err() to extract the error encoded in the return value of xa_store(). Fixes: 44c2fbebe18a ("mlxsw: spectrum_router: Share nexthop counters in resilient groups") Signed-off-by: Yuan Can Reviewed-by: Petr Machata Reviewed-by: Przemek Kitszel Tested-by: Petr Machata Link: https://patch.msgid.link/20241017023223.74180-1-yuancan@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 800dfb64ec83..7d6d859cef3f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3197,7 +3197,6 @@ mlxsw_sp_nexthop_sh_counter_get(struct mlxsw_sp *mlxsw_sp, { struct mlxsw_sp_nexthop_group *nh_grp = nh->nhgi->nh_grp; struct mlxsw_sp_nexthop_counter *nhct; - void *ptr; int err; nhct = xa_load(&nh_grp->nhgi->nexthop_counters, nh->id); @@ -3210,12 +3209,10 @@ mlxsw_sp_nexthop_sh_counter_get(struct mlxsw_sp *mlxsw_sp, if (IS_ERR(nhct)) return nhct; - ptr = xa_store(&nh_grp->nhgi->nexthop_counters, nh->id, nhct, - GFP_KERNEL); - if (IS_ERR(ptr)) { - err = PTR_ERR(ptr); + err = xa_err(xa_store(&nh_grp->nhgi->nexthop_counters, nh->id, nhct, + GFP_KERNEL)); + if (err) goto err_store; - } return nhct; From 8a7d12d674ac6f2147c18f36d1e15f1a48060edf Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 17 Oct 2024 09:18:37 +0200 Subject: [PATCH 26/40] net: usb: usbnet: fix name regression The fix for MAC addresses broke detection of the naming convention because it gave network devices no random MAC before bind() was called. This means that the check for the local assignment bit was always negative as the address was zeroed from allocation, instead of from overwriting the MAC with a unique hardware address. The correct check for whether bind() has altered the MAC is done with is_zero_ether_addr Signed-off-by: Oliver Neukum Reported-by: Greg Thelen Diagnosed-by: John Sperbeck Fixes: bab8eb0dd4cb9 ("usbnet: modern method to get random MAC") Link: https://patch.msgid.link/20241017071849.389636-1-oneukum@suse.com Signed-off-by: Paolo Abeni --- drivers/net/usb/usbnet.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index ee1b5fd7b491..44179f4e807f 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1767,7 +1767,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) // can rename the link if it knows better. if ((dev->driver_info->flags & FLAG_ETHER) != 0 && ((dev->driver_info->flags & FLAG_POINTTOPOINT) == 0 || - (net->dev_addr [0] & 0x02) == 0)) + /* somebody touched it*/ + !is_zero_ether_addr(net->dev_addr))) strscpy(net->name, "eth%d", sizeof(net->name)); /* WLAN devices should always be named "wlan%d" */ if ((dev->driver_info->flags & FLAG_WLAN) != 0) From 34d35b4edbbe890a91bec939bfd29ad92517a52b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Oct 2024 19:10:48 +0300 Subject: [PATCH 27/40] net/sched: act_api: deny mismatched skip_sw/skip_hw flags for actions created by classifiers tcf_action_init() has logic for checking mismatches between action and filter offload flags (skip_sw/skip_hw). AFAIU, this is intended to run on the transition between the new tc_act_bind(flags) returning true (aka now gets bound to classifier) and tc_act_bind(act->tcfa_flags) returning false (aka action was not bound to classifier before). Otherwise, the check is skipped. For the case where an action is not standalone, but rather it was created by a classifier and is bound to it, tcf_action_init() skips the check entirely, and this means it allows mismatched flags to occur. Taking the matchall classifier code path as an example (with mirred as an action), the reason is the following: 1 | mall_change() 2 | -> mall_replace_hw_filter() 3 | -> tcf_exts_validate_ex() 4 | -> flags |= TCA_ACT_FLAGS_BIND; 5 | -> tcf_action_init() 6 | -> tcf_action_init_1() 7 | -> a_o->init() 8 | -> tcf_mirred_init() 9 | -> tcf_idr_create_from_flags() 10 | -> tcf_idr_create() 11 | -> p->tcfa_flags = flags; 12 | -> tc_act_bind(flags)) 13 | -> tc_act_bind(act->tcfa_flags) When invoked from tcf_exts_validate_ex() like matchall does (but other classifiers validate their extensions as well), tcf_action_init() runs in a call path where "flags" always contains TCA_ACT_FLAGS_BIND (set by line 4). So line 12 is always true, and line 13 is always true as well. No transition ever takes place, and the check is skipped. The code was added in this form in commit c86e0209dc77 ("flow_offload: validate flags of filter and actions"), but I'm attributing the blame even earlier in that series, to when TCA_ACT_FLAGS_SKIP_HW and TCA_ACT_FLAGS_SKIP_SW were added to the UAPI. Following the development process of this change, the check did not always exist in this form. A change took place between v3 [1] and v4 [2], AFAIU due to review feedback that it doesn't make sense for action flags to be different than classifier flags. I think I agree with that feedback, but it was translated into code that omits enforcing this for "classic" actions created at the same time with the filters themselves. There are 3 more important cases to discuss. First there is this command: $ tc qdisc add dev eth0 clasct $ tc filter add dev eth0 ingress matchall skip_sw \ action mirred ingress mirror dev eth1 which should be allowed, because prior to the concept of dedicated action flags, it used to work and it used to mean the action inherited the skip_sw/skip_hw flags from the classifier. It's not a mismatch. Then we have this command: $ tc qdisc add dev eth0 clasct $ tc filter add dev eth0 ingress matchall skip_sw \ action mirred ingress mirror dev eth1 skip_hw where there is a mismatch and it should be rejected. Finally, we have: $ tc qdisc add dev eth0 clasct $ tc filter add dev eth0 ingress matchall skip_sw \ action mirred ingress mirror dev eth1 skip_sw where the offload flags coincide, and this should be treated the same as the first command based on inheritance, and accepted. [1]: https://lore.kernel.org/netdev/20211028110646.13791-9-simon.horman@corigine.com/ [2]: https://lore.kernel.org/netdev/20211118130805.23897-10-simon.horman@corigine.com/ Fixes: 7adc57651211 ("flow_offload: add skip_hw and skip_sw to control if offload the action") Signed-off-by: Vladimir Oltean Reviewed-by: Simon Horman Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20241017161049.3570037-1-vladimir.oltean@nxp.com Signed-off-by: Paolo Abeni --- net/sched/act_api.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 2714c4ed928e..eecad65fec92 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1498,8 +1498,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, bool skip_sw = tc_skip_sw(fl_flags); bool skip_hw = tc_skip_hw(fl_flags); - if (tc_act_bind(act->tcfa_flags)) + if (tc_act_bind(act->tcfa_flags)) { + /* Action is created by classifier and is not + * standalone. Check that the user did not set + * any action flags different than the + * classifier flags, and inherit the flags from + * the classifier for the compatibility case + * where no flags were specified at all. + */ + if ((tc_act_skip_sw(act->tcfa_flags) && !skip_sw) || + (tc_act_skip_hw(act->tcfa_flags) && !skip_hw)) { + NL_SET_ERR_MSG(extack, + "Mismatch between action and filter offload flags"); + err = -EINVAL; + goto err; + } + if (skip_sw) + act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_SW; + if (skip_hw) + act->tcfa_flags |= TCA_ACT_FLAGS_SKIP_HW; continue; + } + + /* Action is standalone */ if (skip_sw != tc_act_skip_sw(act->tcfa_flags) || skip_hw != tc_act_skip_hw(act->tcfa_flags)) { NL_SET_ERR_MSG(extack, From f504465970aebb2467da548f7c1efbbf36d0f44b Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 18 Oct 2024 08:13:38 +0300 Subject: [PATCH 28/40] net: sched: fix use-after-free in taprio_change() In 'taprio_change()', 'admin' pointer may become dangling due to sched switch / removal caused by 'advance_sched()', and critical section protected by 'q->current_entry_lock' is too small to prevent from such a scenario (which causes use-after-free detected by KASAN). Fix this by prefer 'rcu_replace_pointer()' over 'rcu_assign_pointer()' to update 'admin' immediately before an attempt to schedule freeing. Fixes: a3d43c0d56f1 ("taprio: Add support adding an admin schedule") Reported-by: syzbot+b65e0af58423fc8a73aa@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=b65e0af58423fc8a73aa Acked-by: Vinicius Costa Gomes Signed-off-by: Dmitry Antipov Link: https://patch.msgid.link/20241018051339.418890-1-dmantipov@yandex.ru Signed-off-by: Paolo Abeni --- net/sched/sch_taprio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 8498d0606b24..9f4e004cdb8b 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1965,7 +1965,8 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, taprio_start_sched(sch, start, new_admin); - rcu_assign_pointer(q->admin_sched, new_admin); + admin = rcu_replace_pointer(q->admin_sched, new_admin, + lockdep_rtnl_is_held()); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); From b22db8b8befe90b61c98626ca1a2fbb0505e9fe3 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 18 Oct 2024 08:13:39 +0300 Subject: [PATCH 29/40] net: sched: use RCU read-side critical section in taprio_dump() Fix possible use-after-free in 'taprio_dump()' by adding RCU read-side critical section there. Never seen on x86 but found on a KASAN-enabled arm64 system when investigating https://syzkaller.appspot.com/bug?extid=b65e0af58423fc8a73aa: [T15862] BUG: KASAN: slab-use-after-free in taprio_dump+0xa0c/0xbb0 [T15862] Read of size 4 at addr ffff0000d4bb88f8 by task repro/15862 [T15862] [T15862] CPU: 0 UID: 0 PID: 15862 Comm: repro Not tainted 6.11.0-rc1-00293-gdefaf1a2113a-dirty #2 [T15862] Hardware name: QEMU QEMU Virtual Machine, BIOS edk2-20240524-5.fc40 05/24/2024 [T15862] Call trace: [T15862] dump_backtrace+0x20c/0x220 [T15862] show_stack+0x2c/0x40 [T15862] dump_stack_lvl+0xf8/0x174 [T15862] print_report+0x170/0x4d8 [T15862] kasan_report+0xb8/0x1d4 [T15862] __asan_report_load4_noabort+0x20/0x2c [T15862] taprio_dump+0xa0c/0xbb0 [T15862] tc_fill_qdisc+0x540/0x1020 [T15862] qdisc_notify.isra.0+0x330/0x3a0 [T15862] tc_modify_qdisc+0x7b8/0x1838 [T15862] rtnetlink_rcv_msg+0x3c8/0xc20 [T15862] netlink_rcv_skb+0x1f8/0x3d4 [T15862] rtnetlink_rcv+0x28/0x40 [T15862] netlink_unicast+0x51c/0x790 [T15862] netlink_sendmsg+0x79c/0xc20 [T15862] __sock_sendmsg+0xe0/0x1a0 [T15862] ____sys_sendmsg+0x6c0/0x840 [T15862] ___sys_sendmsg+0x1ac/0x1f0 [T15862] __sys_sendmsg+0x110/0x1d0 [T15862] __arm64_sys_sendmsg+0x74/0xb0 [T15862] invoke_syscall+0x88/0x2e0 [T15862] el0_svc_common.constprop.0+0xe4/0x2a0 [T15862] do_el0_svc+0x44/0x60 [T15862] el0_svc+0x50/0x184 [T15862] el0t_64_sync_handler+0x120/0x12c [T15862] el0t_64_sync+0x190/0x194 [T15862] [T15862] Allocated by task 15857: [T15862] kasan_save_stack+0x3c/0x70 [T15862] kasan_save_track+0x20/0x3c [T15862] kasan_save_alloc_info+0x40/0x60 [T15862] __kasan_kmalloc+0xd4/0xe0 [T15862] __kmalloc_cache_noprof+0x194/0x334 [T15862] taprio_change+0x45c/0x2fe0 [T15862] tc_modify_qdisc+0x6a8/0x1838 [T15862] rtnetlink_rcv_msg+0x3c8/0xc20 [T15862] netlink_rcv_skb+0x1f8/0x3d4 [T15862] rtnetlink_rcv+0x28/0x40 [T15862] netlink_unicast+0x51c/0x790 [T15862] netlink_sendmsg+0x79c/0xc20 [T15862] __sock_sendmsg+0xe0/0x1a0 [T15862] ____sys_sendmsg+0x6c0/0x840 [T15862] ___sys_sendmsg+0x1ac/0x1f0 [T15862] __sys_sendmsg+0x110/0x1d0 [T15862] __arm64_sys_sendmsg+0x74/0xb0 [T15862] invoke_syscall+0x88/0x2e0 [T15862] el0_svc_common.constprop.0+0xe4/0x2a0 [T15862] do_el0_svc+0x44/0x60 [T15862] el0_svc+0x50/0x184 [T15862] el0t_64_sync_handler+0x120/0x12c [T15862] el0t_64_sync+0x190/0x194 [T15862] [T15862] Freed by task 6192: [T15862] kasan_save_stack+0x3c/0x70 [T15862] kasan_save_track+0x20/0x3c [T15862] kasan_save_free_info+0x4c/0x80 [T15862] poison_slab_object+0x110/0x160 [T15862] __kasan_slab_free+0x3c/0x74 [T15862] kfree+0x134/0x3c0 [T15862] taprio_free_sched_cb+0x18c/0x220 [T15862] rcu_core+0x920/0x1b7c [T15862] rcu_core_si+0x10/0x1c [T15862] handle_softirqs+0x2e8/0xd64 [T15862] __do_softirq+0x14/0x20 Fixes: 18cdd2f0998a ("net/sched: taprio: taprio_dump and taprio_change are protected by rtnl_mutex") Acked-by: Vinicius Costa Gomes Signed-off-by: Dmitry Antipov Link: https://patch.msgid.link/20241018051339.418890-2-dmantipov@yandex.ru Signed-off-by: Paolo Abeni --- net/sched/sch_taprio.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9f4e004cdb8b..8623dc0bafc0 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -2374,9 +2374,6 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; - oper = rtnl_dereference(q->oper_sched); - admin = rtnl_dereference(q->admin_sched); - mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -2397,18 +2394,23 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; + rcu_read_lock(); + + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); + if (oper && taprio_dump_tc_entries(skb, q, oper)) - goto options_error; + goto options_error_rcu; if (oper && dump_schedule(skb, oper)) - goto options_error; + goto options_error_rcu; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) - goto options_error; + goto options_error_rcu; if (dump_schedule(skb, admin)) goto admin_error; @@ -2416,11 +2418,15 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_nest_end(skb, sched_nest); done: + rcu_read_unlock(); return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); +options_error_rcu: + rcu_read_unlock(); + options_error: nla_nest_cancel(skb, nest); From 10ce0db787004875f4dba068ea952207d1d8abeb Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 18 Oct 2024 11:08:16 +0200 Subject: [PATCH 30/40] r8169: avoid unsolicited interrupts It was reported that after resume from suspend a PCI error is logged and connectivity is broken. Error message is: PCI error (cmd = 0x0407, status_errs = 0x0000) The message seems to be a red herring as none of the error bits is set, and the PCI command register value also is normal. Exception handling for a PCI error includes a chip reset what apparently brakes connectivity here. The interrupt status bit triggering the PCI error handling isn't actually used on PCIe chip versions, so it's not clear why this bit is set by the chip. Fix this by ignoring this bit on PCIe chip versions. Fixes: 0e4851502f84 ("r8169: merge with version 8.001.00 of Realtek's r8168 driver") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219388 Tested-by: Atlas Yu Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/78e2f535-438f-4212-ad94-a77637ac6c9c@gmail.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/realtek/r8169_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 0cc9baaecb1b..713a89bb21e9 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4682,7 +4682,9 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) if ((status & 0xffff) == 0xffff || !(status & tp->irq_mask)) return IRQ_NONE; - if (unlikely(status & SYSErr)) { + /* At least RTL8168fp may unexpectedly set the SYSErr bit */ + if (unlikely(status & SYSErr && + tp->mac_version <= RTL_GIGA_MAC_VER_06)) { rtl8169_pcierr_interrupt(tp->dev); goto out; } From 6e62807c7fbb3c758d233018caf94dfea9c65dbd Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Fri, 18 Oct 2024 18:07:48 +0800 Subject: [PATCH 31/40] posix-clock: posix-clock: Fix unbalanced locking in pc_clock_settime() If get_clock_desc() succeeds, it calls fget() for the clockid's fd, and get the clk->rwsem read lock, so the error path should release the lock to make the lock balance and fput the clockid's fd to make the refcount balance and release the fd related resource. However the below commit left the error path locked behind resulting in unbalanced locking. Check timespec64_valid_strict() before get_clock_desc() to fix it, because the "ts" is not changed after that. Fixes: d8794ac20a29 ("posix-clock: Fix missing timespec64 check in pc_clock_settime()") Acked-by: Richard Cochran Signed-off-by: Jinjie Ruan Acked-by: Anna-Maria Behnsen [pabeni@redhat.com: fixed commit message typo] Signed-off-by: Paolo Abeni --- kernel/time/posix-clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 316a4e8c97d3..1af0bb2cc45c 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -309,6 +309,9 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) struct posix_clock_desc cd; int err; + if (!timespec64_valid_strict(ts)) + return -EINVAL; + err = get_clock_desc(id, &cd); if (err) return err; @@ -318,9 +321,6 @@ static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) goto out; } - if (!timespec64_valid_strict(ts)) - return -EINVAL; - if (cd.clk->ops.clock_settime) err = cd.clk->ops.clock_settime(cd.clk, ts); else From 989fa5171f005ecf63440057218d8aeb1795287d Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 22 Oct 2024 11:09:13 -0400 Subject: [PATCH 32/40] Bluetooth: hci_core: Disable works on hci_unregister_dev This make use of disable_work_* on hci_unregister_dev since the hci_dev is about to be freed new submissions are not disarable. Fixes: 0d151a103775 ("Bluetooth: hci_core: cancel all works upon hci_unregister_dev()") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 24 +++++++++++++++--------- net/bluetooth/hci_sync.c | 12 +++++++++--- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 629c302f7407..96d097b21d13 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1644,12 +1644,12 @@ void hci_adv_instances_clear(struct hci_dev *hdev) struct adv_info *adv_instance, *n; if (hdev->adv_instance_timeout) { - cancel_delayed_work(&hdev->adv_instance_expire); + disable_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { - cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); + disable_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } @@ -2685,11 +2685,11 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); - cancel_work_sync(&hdev->rx_work); - cancel_work_sync(&hdev->cmd_work); - cancel_work_sync(&hdev->tx_work); - cancel_work_sync(&hdev->power_on); - cancel_work_sync(&hdev->error_reset); + disable_work_sync(&hdev->rx_work); + disable_work_sync(&hdev->cmd_work); + disable_work_sync(&hdev->tx_work); + disable_work_sync(&hdev->power_on); + disable_work_sync(&hdev->error_reset); hci_cmd_sync_clear(hdev); @@ -2796,8 +2796,14 @@ static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err) { bt_dev_dbg(hdev, "err 0x%2.2x", err); - cancel_delayed_work_sync(&hdev->cmd_timer); - cancel_delayed_work_sync(&hdev->ncmd_timer); + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { + disable_delayed_work_sync(&hdev->cmd_timer); + disable_delayed_work_sync(&hdev->ncmd_timer); + } else { + cancel_delayed_work_sync(&hdev->cmd_timer); + cancel_delayed_work_sync(&hdev->ncmd_timer); + } + atomic_set(&hdev->cmd_cnt, 1); hci_cmd_sync_cancel_sync(hdev, err); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 40ccdef168d7..ae7a5817883a 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -5131,9 +5131,15 @@ int hci_dev_close_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); - cancel_delayed_work(&hdev->power_off); - cancel_delayed_work(&hdev->ncmd_timer); - cancel_delayed_work(&hdev->le_scan_disable); + if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) { + disable_delayed_work(&hdev->power_off); + disable_delayed_work(&hdev->ncmd_timer); + disable_delayed_work(&hdev->le_scan_disable); + } else { + cancel_delayed_work(&hdev->power_off); + cancel_delayed_work(&hdev->ncmd_timer); + cancel_delayed_work(&hdev->le_scan_disable); + } hci_cmd_sync_cancel_sync(hdev, ENODEV); From 1bf4470a3939c678fb822073e9ea77a0560bc6bb Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 22 Oct 2024 12:31:08 -0400 Subject: [PATCH 33/40] Bluetooth: SCO: Fix UAF on sco_sock_timeout conn->sk maybe have been unlinked/freed while waiting for sco_conn_lock so this checks if the conn->sk is still valid by checking if it part of sco_sk_list. Reported-by: syzbot+4c0d0c4cde787116d465@syzkaller.appspotmail.com Tested-by: syzbot+4c0d0c4cde787116d465@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4c0d0c4cde787116d465 Fixes: ba316be1b6a0 ("Bluetooth: schedule SCO timeouts with delayed_work") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 1 + net/bluetooth/af_bluetooth.c | 22 ++++++++++++++++++++++ net/bluetooth/sco.c | 18 ++++++++++++------ 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 5d655e109b2c..f66bc85c6411 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -403,6 +403,7 @@ int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); +bool bt_sock_linked(struct bt_sock_list *l, struct sock *s); struct sock *bt_sock_alloc(struct net *net, struct socket *sock, struct proto *prot, int proto, gfp_t prio, int kern); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 67604ccec2f4..bdce9270487d 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -185,6 +185,28 @@ void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk) } EXPORT_SYMBOL(bt_sock_unlink); +bool bt_sock_linked(struct bt_sock_list *l, struct sock *s) +{ + struct sock *sk; + + if (!l || !s) + return false; + + read_lock(&l->lock); + + sk_for_each(sk, &l->head) { + if (s == sk) { + read_unlock(&l->lock); + return true; + } + } + + read_unlock(&l->lock); + + return false; +} +EXPORT_SYMBOL(bt_sock_linked); + void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh) { const struct cred *old_cred; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index a5ac160c592e..1c7252a36866 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -76,6 +76,16 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) +static struct sock *sco_sock_hold(struct sco_conn *conn) +{ + if (!conn || !bt_sock_linked(&sco_sk_list, conn->sk)) + return NULL; + + sock_hold(conn->sk); + + return conn->sk; +} + static void sco_sock_timeout(struct work_struct *work) { struct sco_conn *conn = container_of(work, struct sco_conn, @@ -87,9 +97,7 @@ static void sco_sock_timeout(struct work_struct *work) sco_conn_unlock(conn); return; } - sk = conn->sk; - if (sk) - sock_hold(sk); + sk = sco_sock_hold(conn); sco_conn_unlock(conn); if (!sk) @@ -194,9 +202,7 @@ static void sco_conn_del(struct hci_conn *hcon, int err) /* Kill socket */ sco_conn_lock(conn); - sk = conn->sk; - if (sk) - sock_hold(sk); + sk = sco_sock_hold(conn); sco_conn_unlock(conn); if (sk) { From 246b435ad668596aa0e2bbb9d491b6413861211a Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 22 Oct 2024 15:35:49 -0400 Subject: [PATCH 34/40] Bluetooth: ISO: Fix UAF on iso_sock_timeout conn->sk maybe have been unlinked/freed while waiting for iso_conn_lock so this checks if the conn->sk is still valid by checking if it part of iso_sk_list. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index d5e00d0dd1a0..ebd3e097d72e 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -93,6 +93,16 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, #define ISO_CONN_TIMEOUT (HZ * 40) #define ISO_DISCONN_TIMEOUT (HZ * 2) +static struct sock *iso_sock_hold(struct iso_conn *conn) +{ + if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk)) + return NULL; + + sock_hold(conn->sk); + + return conn->sk; +} + static void iso_sock_timeout(struct work_struct *work) { struct iso_conn *conn = container_of(work, struct iso_conn, @@ -100,9 +110,7 @@ static void iso_sock_timeout(struct work_struct *work) struct sock *sk; iso_conn_lock(conn); - sk = conn->sk; - if (sk) - sock_hold(sk); + sk = iso_sock_hold(conn); iso_conn_unlock(conn); if (!sk) @@ -209,9 +217,7 @@ static void iso_conn_del(struct hci_conn *hcon, int err) /* Kill socket */ iso_conn_lock(conn); - sk = conn->sk; - if (sk) - sock_hold(sk); + sk = iso_sock_hold(conn); iso_conn_unlock(conn); if (sk) { From ee76eb24343bdd5450eb87572865a4d7fffd335b Mon Sep 17 00:00:00 2001 From: Tim Harvey Date: Fri, 18 Oct 2024 09:06:58 -0700 Subject: [PATCH 35/40] net: dsa: microchip: disable EEE for KSZ879x/KSZ877x/KSZ876x The well-known errata regarding EEE not being functional on various KSZ switches has been refactored a few times. Recently the refactoring has excluded several switches that the errata should also apply to. Disable EEE for additional switches with this errata and provide additional comments referring to the public errata document. The original workaround for the errata was applied with a register write to manually disable the EEE feature in MMD 7:60 which was being applied for KSZ9477/KSZ9897/KSZ9567 switch ID's. Then came commit 26dd2974c5b5 ("net: phy: micrel: Move KSZ9477 errata fixes to PHY driver") and commit 6068e6d7ba50 ("net: dsa: microchip: remove KSZ9477 PHY errata handling") which moved the errata from the switch driver to the PHY driver but only for PHY_ID_KSZ9477 (PHY ID) however that PHY code was dead code because an entry was never added for PHY_ID_KSZ9477 via MODULE_DEVICE_TABLE. This was apparently realized much later and commit 54a4e5c16382 ("net: phy: micrel: add Microchip KSZ 9477 to the device table") added the PHY_ID_KSZ9477 to the PHY driver but as the errata was only being applied to PHY_ID_KSZ9477 it's not completely clear what switches that relates to. Later commit 6149db4997f5 ("net: phy: micrel: fix KSZ9477 PHY issues after suspend/resume") breaks this again for all but KSZ9897 by only applying the errata for that PHY ID. Following that this was affected with commit 08c6d8bae48c("net: phy: Provide Module 4 KSZ9477 errata (DS80000754C)") which removes the blatant register write to MMD 7:60 and replaces it by setting phydev->eee_broken_modes = -1 so that the generic phy-c45 code disables EEE but this is only done for the KSZ9477_CHIP_ID (Switch ID). Lastly commit 0411f73c13af ("net: dsa: microchip: disable EEE for KSZ8567/KSZ9567/KSZ9896/KSZ9897.") adds some additional switches that were missing to the errata due to the previous changes. This commit adds an additional set of switches. Fixes: 0411f73c13af ("net: dsa: microchip: disable EEE for KSZ8567/KSZ9567/KSZ9896/KSZ9897.") Signed-off-by: Tim Harvey Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20241018160658.781564-1-tharvey@gateworks.com Signed-off-by: Paolo Abeni --- drivers/net/dsa/microchip/ksz_common.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 4e8710c7cb7b..5290f5ad98f3 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -2733,26 +2733,27 @@ static u32 ksz_get_phy_flags(struct dsa_switch *ds, int port) return MICREL_KSZ8_P1_ERRATA; break; case KSZ8567_CHIP_ID: + /* KSZ8567R Errata DS80000752C Module 4 */ + case KSZ8765_CHIP_ID: + case KSZ8794_CHIP_ID: + case KSZ8795_CHIP_ID: + /* KSZ879x/KSZ877x/KSZ876x Errata DS80000687C Module 2 */ case KSZ9477_CHIP_ID: + /* KSZ9477S Errata DS80000754A Module 4 */ case KSZ9567_CHIP_ID: + /* KSZ9567S Errata DS80000756A Module 4 */ case KSZ9896_CHIP_ID: + /* KSZ9896C Errata DS80000757A Module 3 */ case KSZ9897_CHIP_ID: - /* KSZ9477 Errata DS80000754C - * - * Module 4: Energy Efficient Ethernet (EEE) feature select must - * be manually disabled + /* KSZ9897R Errata DS80000758C Module 4 */ + /* Energy Efficient Ethernet (EEE) feature select must be manually disabled * The EEE feature is enabled by default, but it is not fully * operational. It must be manually disabled through register * controls. If not disabled, the PHY ports can auto-negotiate * to enable EEE, and this feature can cause link drops when * linked to another device supporting EEE. * - * The same item appears in the errata for the KSZ9567, KSZ9896, - * and KSZ9897. - * - * A similar item appears in the errata for the KSZ8567, but - * provides an alternative workaround. For now, use the simple - * workaround of disabling the EEE feature for this device too. + * The same item appears in the errata for all switches above. */ return MICREL_NO_EEE; } From 4c262801ea60c518b5bebc22a09f5b78b3147da2 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 18 Oct 2024 11:25:22 -0700 Subject: [PATCH 36/40] hv_netvsc: Fix VF namespace also in synthetic NIC NETDEV_REGISTER event The existing code moves VF to the same namespace as the synthetic NIC during netvsc_register_vf(). But, if the synthetic device is moved to a new namespace after the VF registration, the VF won't be moved together. To make the behavior more consistent, add a namespace check for synthetic NIC's NETDEV_REGISTER event (generated during its move), and move the VF if it is not in the same namespace. Cc: stable@vger.kernel.org Fixes: c0a41b887ce6 ("hv_netvsc: move VF to same namespace as netvsc device") Suggested-by: Stephen Hemminger Signed-off-by: Haiyang Zhang Reviewed-by: Simon Horman Link: https://patch.msgid.link/1729275922-17595-1-git-send-email-haiyangz@microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/hyperv/netvsc_drv.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 153b97f8ec0d..23180f7b67b6 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2798,6 +2798,31 @@ static struct hv_driver netvsc_drv = { }, }; +/* Set VF's namespace same as the synthetic NIC */ +static void netvsc_event_set_vf_ns(struct net_device *ndev) +{ + struct net_device_context *ndev_ctx = netdev_priv(ndev); + struct net_device *vf_netdev; + int ret; + + vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); + if (!vf_netdev) + return; + + if (!net_eq(dev_net(ndev), dev_net(vf_netdev))) { + ret = dev_change_net_namespace(vf_netdev, dev_net(ndev), + "eth%d"); + if (ret) + netdev_err(vf_netdev, + "Cannot move to same namespace as %s: %d\n", + ndev->name, ret); + else + netdev_info(vf_netdev, + "Moved VF to namespace with: %s\n", + ndev->name); + } +} + /* * On Hyper-V, every VF interface is matched with a corresponding * synthetic interface. The synthetic interface is presented first @@ -2810,6 +2835,11 @@ static int netvsc_netdev_event(struct notifier_block *this, struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); int ret = 0; + if (event_dev->netdev_ops == &device_ops && event == NETDEV_REGISTER) { + netvsc_event_set_vf_ns(event_dev); + return NOTIFY_DONE; + } + ret = check_dev_is_matching_vf(event_dev); if (ret != 0) return NOTIFY_DONE; From 64761c980cbf71fb7a532a8c7299907ea972a88c Mon Sep 17 00:00:00 2001 From: Reinhard Speyerer Date: Fri, 18 Oct 2024 22:52:55 +0200 Subject: [PATCH 37/40] net: usb: qmi_wwan: add Fibocom FG132 0x0112 composition Add Fibocom FG132 0x0112 composition: T: Bus=03 Lev=02 Prnt=06 Port=01 Cnt=02 Dev#= 10 Spd=12 MxCh= 0 D: Ver= 2.01 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=2cb7 ProdID=0112 Rev= 5.15 S: Manufacturer=Fibocom Wireless Inc. S: Product=Fibocom Module S: SerialNumber=xxxxxxxx C:* #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=50 Driver=qmi_wwan E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl=32ms E: Ad=81(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=01(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=option E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=83(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=option E: Ad=85(I) Atr=03(Int.) MxPS= 10 Ivl=32ms E: Ad=84(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=03(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option E: Ad=86(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=04(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms Signed-off-by: Reinhard Speyerer Link: https://patch.msgid.link/ZxLKp5YZDy-OM0-e@arcor.de Signed-off-by: Paolo Abeni --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 4823dbdf5465..f137c82f1c0f 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1426,6 +1426,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2c7c, 0x0296, 4)}, /* Quectel BG96 */ {QMI_QUIRK_SET_DTR(0x2c7c, 0x030e, 4)}, /* Quectel EM05GV2 */ {QMI_QUIRK_SET_DTR(0x2cb7, 0x0104, 4)}, /* Fibocom NL678 series */ + {QMI_QUIRK_SET_DTR(0x2cb7, 0x0112, 0)}, /* Fibocom FG132 */ {QMI_FIXED_INTF(0x0489, 0xe0b4, 0)}, /* Foxconn T77W968 LTE */ {QMI_FIXED_INTF(0x0489, 0xe0b5, 0)}, /* Foxconn T77W968 LTE with eSIM support*/ {QMI_FIXED_INTF(0x2692, 0x9025, 4)}, /* Cellient MPL200 (rebranded Qualcomm 05c6:9025) */ From 67af86afff74c914944374a103c04e4d9868dd15 Mon Sep 17 00:00:00 2001 From: Shenghao Yang Date: Sun, 20 Oct 2024 14:38:28 +0800 Subject: [PATCH 38/40] net: dsa: mv88e6xxx: group cycle counter coefficients Instead of having them as individual fields in ptp_ops, wrap the coefficients in a separate struct so they can be referenced together. Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: Shenghao Yang Reviewed-by: Andrew Lunn Signed-off-by: Paolo Abeni --- drivers/net/dsa/mv88e6xxx/chip.h | 6 ++-- drivers/net/dsa/mv88e6xxx/ptp.c | 59 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index c34caf9815c5..1d003a9deafa 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -206,6 +206,7 @@ struct mv88e6xxx_gpio_ops; struct mv88e6xxx_avb_ops; struct mv88e6xxx_ptp_ops; struct mv88e6xxx_pcs_ops; +struct mv88e6xxx_cc_coeffs; struct mv88e6xxx_irq { u16 masked; @@ -731,10 +732,7 @@ struct mv88e6xxx_ptp_ops { int arr1_sts_reg; int dep_sts_reg; u32 rx_filters; - u32 cc_shift; - u32 cc_mult; - u32 cc_mult_num; - u32 cc_mult_dem; + const struct mv88e6xxx_cc_coeffs *cc_coeffs; }; struct mv88e6xxx_pcs_ops { diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index 56391e09b325..641af44e00af 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -18,6 +18,13 @@ #define MV88E6XXX_MAX_ADJ_PPB 1000000 +struct mv88e6xxx_cc_coeffs { + u32 cc_shift; + u32 cc_mult; + u32 cc_mult_num; + u32 cc_mult_dem; +}; + /* Family MV88E6250: * Raw timestamps are in units of 10-ns clock periods. * @@ -25,10 +32,13 @@ * simplifies to * clkadj = scaled_ppm * 2^7 / 5^5 */ -#define MV88E6250_CC_SHIFT 28 -#define MV88E6250_CC_MULT (10 << MV88E6250_CC_SHIFT) -#define MV88E6250_CC_MULT_NUM (1 << 7) -#define MV88E6250_CC_MULT_DEM 3125ULL +#define MV88E6250_CC_SHIFT 28 +static const struct mv88e6xxx_cc_coeffs mv88e6250_cc_coeffs = { + .cc_shift = MV88E6250_CC_SHIFT, + .cc_mult = 10 << MV88E6250_CC_SHIFT, + .cc_mult_num = 1 << 7, + .cc_mult_dem = 3125ULL, +}; /* Other families: * Raw timestamps are in units of 8-ns clock periods. @@ -37,10 +47,13 @@ * simplifies to * clkadj = scaled_ppm * 2^9 / 5^6 */ -#define MV88E6XXX_CC_SHIFT 28 -#define MV88E6XXX_CC_MULT (8 << MV88E6XXX_CC_SHIFT) -#define MV88E6XXX_CC_MULT_NUM (1 << 9) -#define MV88E6XXX_CC_MULT_DEM 15625ULL +#define MV88E6XXX_CC_SHIFT 28 +static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_coeffs = { + .cc_shift = MV88E6XXX_CC_SHIFT, + .cc_mult = 8 << MV88E6XXX_CC_SHIFT, + .cc_mult_num = 1 << 9, + .cc_mult_dem = 15625ULL +}; #define TAI_EVENT_WORK_INTERVAL msecs_to_jiffies(100) @@ -214,10 +227,10 @@ static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) scaled_ppm = -scaled_ppm; } - mult = ptp_ops->cc_mult; - adj = ptp_ops->cc_mult_num; + mult = ptp_ops->cc_coeffs->cc_mult; + adj = ptp_ops->cc_coeffs->cc_mult_num; adj *= scaled_ppm; - diff = div_u64(adj, ptp_ops->cc_mult_dem); + diff = div_u64(adj, ptp_ops->cc_coeffs->cc_mult_dem); mv88e6xxx_reg_lock(chip); @@ -364,10 +377,7 @@ const struct mv88e6xxx_ptp_ops mv88e6165_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_shift = MV88E6XXX_CC_SHIFT, - .cc_mult = MV88E6XXX_CC_MULT, - .cc_mult_num = MV88E6XXX_CC_MULT_NUM, - .cc_mult_dem = MV88E6XXX_CC_MULT_DEM, + .cc_coeffs = &mv88e6xxx_cc_coeffs }; const struct mv88e6xxx_ptp_ops mv88e6250_ptp_ops = { @@ -391,10 +401,7 @@ const struct mv88e6xxx_ptp_ops mv88e6250_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_shift = MV88E6250_CC_SHIFT, - .cc_mult = MV88E6250_CC_MULT, - .cc_mult_num = MV88E6250_CC_MULT_NUM, - .cc_mult_dem = MV88E6250_CC_MULT_DEM, + .cc_coeffs = &mv88e6250_cc_coeffs, }; const struct mv88e6xxx_ptp_ops mv88e6352_ptp_ops = { @@ -418,10 +425,7 @@ const struct mv88e6xxx_ptp_ops mv88e6352_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_shift = MV88E6XXX_CC_SHIFT, - .cc_mult = MV88E6XXX_CC_MULT, - .cc_mult_num = MV88E6XXX_CC_MULT_NUM, - .cc_mult_dem = MV88E6XXX_CC_MULT_DEM, + .cc_coeffs = &mv88e6xxx_cc_coeffs, }; const struct mv88e6xxx_ptp_ops mv88e6390_ptp_ops = { @@ -446,10 +450,7 @@ const struct mv88e6xxx_ptp_ops mv88e6390_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_shift = MV88E6XXX_CC_SHIFT, - .cc_mult = MV88E6XXX_CC_MULT, - .cc_mult_num = MV88E6XXX_CC_MULT_NUM, - .cc_mult_dem = MV88E6XXX_CC_MULT_DEM, + .cc_coeffs = &mv88e6xxx_cc_coeffs, }; static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc) @@ -487,8 +488,8 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip) memset(&chip->tstamp_cc, 0, sizeof(chip->tstamp_cc)); chip->tstamp_cc.read = mv88e6xxx_ptp_clock_read; chip->tstamp_cc.mask = CYCLECOUNTER_MASK(32); - chip->tstamp_cc.mult = ptp_ops->cc_mult; - chip->tstamp_cc.shift = ptp_ops->cc_shift; + chip->tstamp_cc.mult = ptp_ops->cc_coeffs->cc_mult; + chip->tstamp_cc.shift = ptp_ops->cc_coeffs->cc_shift; timecounter_init(&chip->tstamp_tc, &chip->tstamp_cc, ktime_to_ns(ktime_get_real())); From 7e3c18097a709e9b958e721066e5fe76e563739b Mon Sep 17 00:00:00 2001 From: Shenghao Yang Date: Sun, 20 Oct 2024 14:38:29 +0800 Subject: [PATCH 39/40] net: dsa: mv88e6xxx: read cycle counter period from hardware Instead of relying on a fixed mapping of hardware family to cycle counter frequency, pull this information from the MV88E6XXX_TAI_CLOCK_PERIOD register. This lets us support switches whose cycle counter frequencies depend on board design. Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Suggested-by: Andrew Lunn Signed-off-by: Shenghao Yang Reviewed-by: Andrew Lunn Signed-off-by: Paolo Abeni --- drivers/net/dsa/mv88e6xxx/chip.h | 2 +- drivers/net/dsa/mv88e6xxx/ptp.c | 60 ++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 19 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.h b/drivers/net/dsa/mv88e6xxx/chip.h index 1d003a9deafa..a54682240839 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.h +++ b/drivers/net/dsa/mv88e6xxx/chip.h @@ -409,6 +409,7 @@ struct mv88e6xxx_chip { struct cyclecounter tstamp_cc; struct timecounter tstamp_tc; struct delayed_work overflow_work; + const struct mv88e6xxx_cc_coeffs *cc_coeffs; struct ptp_clock *ptp_clock; struct ptp_clock_info ptp_clock_info; @@ -732,7 +733,6 @@ struct mv88e6xxx_ptp_ops { int arr1_sts_reg; int dep_sts_reg; u32 rx_filters; - const struct mv88e6xxx_cc_coeffs *cc_coeffs; }; struct mv88e6xxx_pcs_ops { diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index 641af44e00af..a409b8661fad 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -32,10 +32,10 @@ struct mv88e6xxx_cc_coeffs { * simplifies to * clkadj = scaled_ppm * 2^7 / 5^5 */ -#define MV88E6250_CC_SHIFT 28 -static const struct mv88e6xxx_cc_coeffs mv88e6250_cc_coeffs = { - .cc_shift = MV88E6250_CC_SHIFT, - .cc_mult = 10 << MV88E6250_CC_SHIFT, +#define MV88E6XXX_CC_10NS_SHIFT 28 +static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_10ns_coeffs = { + .cc_shift = MV88E6XXX_CC_10NS_SHIFT, + .cc_mult = 10 << MV88E6XXX_CC_10NS_SHIFT, .cc_mult_num = 1 << 7, .cc_mult_dem = 3125ULL, }; @@ -47,10 +47,10 @@ static const struct mv88e6xxx_cc_coeffs mv88e6250_cc_coeffs = { * simplifies to * clkadj = scaled_ppm * 2^9 / 5^6 */ -#define MV88E6XXX_CC_SHIFT 28 -static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_coeffs = { - .cc_shift = MV88E6XXX_CC_SHIFT, - .cc_mult = 8 << MV88E6XXX_CC_SHIFT, +#define MV88E6XXX_CC_8NS_SHIFT 28 +static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_8ns_coeffs = { + .cc_shift = MV88E6XXX_CC_8NS_SHIFT, + .cc_mult = 8 << MV88E6XXX_CC_8NS_SHIFT, .cc_mult_num = 1 << 9, .cc_mult_dem = 15625ULL }; @@ -96,6 +96,31 @@ static int mv88e6352_set_gpio_func(struct mv88e6xxx_chip *chip, int pin, return chip->info->ops->gpio_ops->set_pctl(chip, pin, func); } +static const struct mv88e6xxx_cc_coeffs * +mv88e6xxx_cc_coeff_get(struct mv88e6xxx_chip *chip) +{ + u16 period_ps; + int err; + + err = mv88e6xxx_tai_read(chip, MV88E6XXX_TAI_CLOCK_PERIOD, &period_ps, 1); + if (err) { + dev_err(chip->dev, "failed to read cycle counter period: %d\n", + err); + return ERR_PTR(err); + } + + switch (period_ps) { + case 8000: + return &mv88e6xxx_cc_8ns_coeffs; + case 10000: + return &mv88e6xxx_cc_10ns_coeffs; + default: + dev_err(chip->dev, "unexpected cycle counter period of %u ps\n", + period_ps); + return ERR_PTR(-ENODEV); + } +} + static u64 mv88e6352_ptp_clock_read(const struct cyclecounter *cc) { struct mv88e6xxx_chip *chip = cc_to_chip(cc); @@ -217,7 +242,6 @@ static void mv88e6352_tai_event_work(struct work_struct *ugly) static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) { struct mv88e6xxx_chip *chip = ptp_to_chip(ptp); - const struct mv88e6xxx_ptp_ops *ptp_ops = chip->info->ops->ptp_ops; int neg_adj = 0; u32 diff, mult; u64 adj; @@ -227,10 +251,10 @@ static int mv88e6xxx_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) scaled_ppm = -scaled_ppm; } - mult = ptp_ops->cc_coeffs->cc_mult; - adj = ptp_ops->cc_coeffs->cc_mult_num; + mult = chip->cc_coeffs->cc_mult; + adj = chip->cc_coeffs->cc_mult_num; adj *= scaled_ppm; - diff = div_u64(adj, ptp_ops->cc_coeffs->cc_mult_dem); + diff = div_u64(adj, chip->cc_coeffs->cc_mult_dem); mv88e6xxx_reg_lock(chip); @@ -377,7 +401,6 @@ const struct mv88e6xxx_ptp_ops mv88e6165_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_coeffs = &mv88e6xxx_cc_coeffs }; const struct mv88e6xxx_ptp_ops mv88e6250_ptp_ops = { @@ -401,7 +424,6 @@ const struct mv88e6xxx_ptp_ops mv88e6250_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_coeffs = &mv88e6250_cc_coeffs, }; const struct mv88e6xxx_ptp_ops mv88e6352_ptp_ops = { @@ -425,7 +447,6 @@ const struct mv88e6xxx_ptp_ops mv88e6352_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_coeffs = &mv88e6xxx_cc_coeffs, }; const struct mv88e6xxx_ptp_ops mv88e6390_ptp_ops = { @@ -450,7 +471,6 @@ const struct mv88e6xxx_ptp_ops mv88e6390_ptp_ops = { (1 << HWTSTAMP_FILTER_PTP_V2_EVENT) | (1 << HWTSTAMP_FILTER_PTP_V2_SYNC) | (1 << HWTSTAMP_FILTER_PTP_V2_DELAY_REQ), - .cc_coeffs = &mv88e6xxx_cc_coeffs, }; static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc) @@ -485,11 +505,15 @@ int mv88e6xxx_ptp_setup(struct mv88e6xxx_chip *chip) int i; /* Set up the cycle counter */ + chip->cc_coeffs = mv88e6xxx_cc_coeff_get(chip); + if (IS_ERR(chip->cc_coeffs)) + return PTR_ERR(chip->cc_coeffs); + memset(&chip->tstamp_cc, 0, sizeof(chip->tstamp_cc)); chip->tstamp_cc.read = mv88e6xxx_ptp_clock_read; chip->tstamp_cc.mask = CYCLECOUNTER_MASK(32); - chip->tstamp_cc.mult = ptp_ops->cc_coeffs->cc_mult; - chip->tstamp_cc.shift = ptp_ops->cc_coeffs->cc_shift; + chip->tstamp_cc.mult = chip->cc_coeffs->cc_mult; + chip->tstamp_cc.shift = chip->cc_coeffs->cc_shift; timecounter_init(&chip->tstamp_tc, &chip->tstamp_cc, ktime_to_ns(ktime_get_real())); From 3e65ede526cf4f95636dbc835598d100c7668ab3 Mon Sep 17 00:00:00 2001 From: Shenghao Yang Date: Sun, 20 Oct 2024 14:38:30 +0800 Subject: [PATCH 40/40] net: dsa: mv88e6xxx: support 4000ps cycle counter period The MV88E6393X family of devices can run its cycle counter off an internal 250MHz clock instead of an external 125MHz one. Add support for this cycle counter period by adding another set of coefficients and lowering the periodic cycle counter read interval to compensate for faster overflows at the increased frequency. Otherwise, the PHC runs at 2x real time in userspace and cannot be synchronized. Fixes: de776d0d316f ("net: dsa: mv88e6xxx: add support for mv88e6393x family") Signed-off-by: Shenghao Yang Reviewed-by: Andrew Lunn Signed-off-by: Paolo Abeni --- drivers/net/dsa/mv88e6xxx/ptp.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/ptp.c b/drivers/net/dsa/mv88e6xxx/ptp.c index a409b8661fad..aed4a4b07f34 100644 --- a/drivers/net/dsa/mv88e6xxx/ptp.c +++ b/drivers/net/dsa/mv88e6xxx/ptp.c @@ -40,7 +40,7 @@ static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_10ns_coeffs = { .cc_mult_dem = 3125ULL, }; -/* Other families: +/* Other families except MV88E6393X in internal clock mode: * Raw timestamps are in units of 8-ns clock periods. * * clkadj = scaled_ppm * 8*2^28 / (10^6 * 2^16) @@ -55,6 +55,21 @@ static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_8ns_coeffs = { .cc_mult_dem = 15625ULL }; +/* Family MV88E6393X using internal clock: + * Raw timestamps are in units of 4-ns clock periods. + * + * clkadj = scaled_ppm * 4*2^28 / (10^6 * 2^16) + * simplifies to + * clkadj = scaled_ppm * 2^8 / 5^6 + */ +#define MV88E6XXX_CC_4NS_SHIFT 28 +static const struct mv88e6xxx_cc_coeffs mv88e6xxx_cc_4ns_coeffs = { + .cc_shift = MV88E6XXX_CC_4NS_SHIFT, + .cc_mult = 4 << MV88E6XXX_CC_4NS_SHIFT, + .cc_mult_num = 1 << 8, + .cc_mult_dem = 15625ULL +}; + #define TAI_EVENT_WORK_INTERVAL msecs_to_jiffies(100) #define cc_to_chip(cc) container_of(cc, struct mv88e6xxx_chip, tstamp_cc) @@ -110,6 +125,8 @@ mv88e6xxx_cc_coeff_get(struct mv88e6xxx_chip *chip) } switch (period_ps) { + case 4000: + return &mv88e6xxx_cc_4ns_coeffs; case 8000: return &mv88e6xxx_cc_8ns_coeffs; case 10000: @@ -483,10 +500,10 @@ static u64 mv88e6xxx_ptp_clock_read(const struct cyclecounter *cc) return 0; } -/* With a 125MHz input clock, the 32-bit timestamp counter overflows in ~34.3 +/* With a 250MHz input clock, the 32-bit timestamp counter overflows in ~17.2 * seconds; this task forces periodic reads so that we don't miss any. */ -#define MV88E6XXX_TAI_OVERFLOW_PERIOD (HZ * 16) +#define MV88E6XXX_TAI_OVERFLOW_PERIOD (HZ * 8) static void mv88e6xxx_ptp_overflow_check(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work);