Merge branch 'inet-implement-lockless-rtm_getnetconf-ops'

Eric Dumazet says:

====================
inet: implement lockless RTM_GETNETCONF ops

This series removes RTNL use for RTM_GETNETCONF operations on AF_INET.

- Annotate data-races to avoid possible KCSAN splats.

- "ip -4 netconf show dev XXX" can be implemented without RTNL [1]

- "ip -4 netconf" dumps can be implemented using RCU instead of RTNL [1]

[1] This only refers to the RTM_GETNETCONF operation; the "ip" command
    also uses RTM_GETLINK dumps, which still use RTNL at this moment.
====================

Link: https://lore.kernel.org/r/20240227092411.2315725-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2024-02-28 19:36:41 -08:00
commit 3cbab89268
5 changed files with 81 additions and 90 deletions

View File

@ -53,13 +53,15 @@ struct in_device {
}; };
#define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1]) #define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1])
#define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr))
#define IPV4_DEVCONF_ALL(net, attr) \ #define IPV4_DEVCONF_ALL(net, attr) \
IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr) IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr)
#define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr))
static inline int ipv4_devconf_get(struct in_device *in_dev, int index) static inline int ipv4_devconf_get(const struct in_device *in_dev, int index)
{ {
index--; index--;
return in_dev->cnf.data[index]; return READ_ONCE(in_dev->cnf.data[index]);
} }
static inline void ipv4_devconf_set(struct in_device *in_dev, int index, static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
@ -67,7 +69,7 @@ static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
{ {
index--; index--;
set_bit(index, in_dev->cnf.state); set_bit(index, in_dev->cnf.state);
in_dev->cnf.data[index] = val; WRITE_ONCE(in_dev->cnf.data[index], val);
} }
static inline void ipv4_devconf_setall(struct in_device *in_dev) static inline void ipv4_devconf_setall(struct in_device *in_dev)
@ -81,18 +83,18 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val)) ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val))
#define IN_DEV_ANDCONF(in_dev, attr) \ #define IN_DEV_ANDCONF(in_dev, attr) \
(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr) && \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \
IN_DEV_CONF_GET((in_dev), attr)) IN_DEV_CONF_GET((in_dev), attr))
#define IN_DEV_NET_ORCONF(in_dev, net, attr) \ #define IN_DEV_NET_ORCONF(in_dev, net, attr) \
(IPV4_DEVCONF_ALL(net, attr) || \ (IPV4_DEVCONF_ALL_RO(net, attr) || \
IN_DEV_CONF_GET((in_dev), attr)) IN_DEV_CONF_GET((in_dev), attr))
#define IN_DEV_ORCONF(in_dev, attr) \ #define IN_DEV_ORCONF(in_dev, attr) \
IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr) IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr)
#define IN_DEV_MAXCONF(in_dev, attr) \ #define IN_DEV_MAXCONF(in_dev, attr) \
(max(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), attr), \ (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \
IN_DEV_CONF_GET((in_dev), attr))) IN_DEV_CONF_GET((in_dev), attr)))
#define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING)

View File

@ -1982,7 +1982,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
return -EMSGSIZE; return -EMSGSIZE;
for (i = 0; i < IPV4_DEVCONF_MAX; i++) for (i = 0; i < IPV4_DEVCONF_MAX; i++)
((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i]; ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]);
return 0; return 0;
} }
@ -2068,9 +2068,9 @@ static int inet_netconf_msgsize_devconf(int type)
} }
static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
struct ipv4_devconf *devconf, u32 portid, const struct ipv4_devconf *devconf,
u32 seq, int event, unsigned int flags, u32 portid, u32 seq, int event,
int type) unsigned int flags, int type)
{ {
struct nlmsghdr *nlh; struct nlmsghdr *nlh;
struct netconfmsg *ncm; struct netconfmsg *ncm;
@ -2095,27 +2095,28 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
if ((all || type == NETCONFA_FORWARDING) && if ((all || type == NETCONFA_FORWARDING) &&
nla_put_s32(skb, NETCONFA_FORWARDING, nla_put_s32(skb, NETCONFA_FORWARDING,
IPV4_DEVCONF(*devconf, FORWARDING)) < 0) IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0)
goto nla_put_failure; goto nla_put_failure;
if ((all || type == NETCONFA_RP_FILTER) && if ((all || type == NETCONFA_RP_FILTER) &&
nla_put_s32(skb, NETCONFA_RP_FILTER, nla_put_s32(skb, NETCONFA_RP_FILTER,
IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0)
goto nla_put_failure; goto nla_put_failure;
if ((all || type == NETCONFA_MC_FORWARDING) && if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING, nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure; goto nla_put_failure;
if ((all || type == NETCONFA_BC_FORWARDING) && if ((all || type == NETCONFA_BC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_BC_FORWARDING, nla_put_s32(skb, NETCONFA_BC_FORWARDING,
IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0) IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0)
goto nla_put_failure; goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) && if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH, nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure; goto nla_put_failure;
if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) IPV4_DEVCONF_RO(*devconf,
IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
goto nla_put_failure; goto nla_put_failure;
out: out:
@ -2204,21 +2205,20 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
struct netlink_ext_ack *extack) struct netlink_ext_ack *extack)
{ {
struct net *net = sock_net(in_skb->sk); struct net *net = sock_net(in_skb->sk);
struct nlattr *tb[NETCONFA_MAX+1]; struct nlattr *tb[NETCONFA_MAX + 1];
const struct ipv4_devconf *devconf;
struct in_device *in_dev = NULL;
struct net_device *dev = NULL;
struct sk_buff *skb; struct sk_buff *skb;
struct ipv4_devconf *devconf;
struct in_device *in_dev;
struct net_device *dev;
int ifindex; int ifindex;
int err; int err;
err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack); err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
if (err) if (err)
goto errout; return err;
err = -EINVAL;
if (!tb[NETCONFA_IFINDEX]) if (!tb[NETCONFA_IFINDEX])
goto errout; return -EINVAL;
ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
switch (ifindex) { switch (ifindex) {
@ -2229,10 +2229,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
devconf = net->ipv4.devconf_dflt; devconf = net->ipv4.devconf_dflt;
break; break;
default: default:
dev = __dev_get_by_index(net, ifindex); err = -ENODEV;
if (!dev) dev = dev_get_by_index(net, ifindex);
goto errout; if (dev)
in_dev = __in_dev_get_rtnl(dev); in_dev = in_dev_get(dev);
if (!in_dev) if (!in_dev)
goto errout; goto errout;
devconf = &in_dev->cnf; devconf = &in_dev->cnf;
@ -2256,6 +2256,9 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
} }
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout: errout:
if (in_dev)
in_dev_put(in_dev);
dev_put(dev);
return err; return err;
} }
@ -2264,11 +2267,13 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
{ {
const struct nlmsghdr *nlh = cb->nlh; const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
int h, s_h; struct {
int idx, s_idx; unsigned long ifindex;
unsigned int all_default;
} *ctx = (void *)cb->ctx;
const struct in_device *in_dev;
struct net_device *dev; struct net_device *dev;
struct in_device *in_dev; int err = 0;
struct hlist_head *head;
if (cb->strict_check) { if (cb->strict_check) {
struct netlink_ext_ack *extack = cb->extack; struct netlink_ext_ack *extack = cb->extack;
@ -2285,64 +2290,47 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
} }
} }
s_h = cb->args[0]; rcu_read_lock();
s_idx = idx = cb->args[1]; for_each_netdev_dump(net, dev, ctx->ifindex) {
in_dev = __in_dev_get_rcu(dev);
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { if (!in_dev)
idx = 0; continue;
head = &net->dev_index_head[h]; err = inet_netconf_fill_devconf(skb, dev->ifindex,
rcu_read_lock(); &in_dev->cnf,
cb->seq = inet_base_seq(net); NETLINK_CB(cb->skb).portid,
hlist_for_each_entry_rcu(dev, head, index_hlist) { nlh->nlmsg_seq,
if (idx < s_idx) RTM_NEWNETCONF, NLM_F_MULTI,
goto cont; NETCONFA_ALL);
in_dev = __in_dev_get_rcu(dev); if (err < 0)
if (!in_dev)
goto cont;
if (inet_netconf_fill_devconf(skb, dev->ifindex,
&in_dev->cnf,
NETLINK_CB(cb->skb).portid,
nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
NETCONFA_ALL) < 0) {
rcu_read_unlock();
goto done;
}
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
idx++;
}
rcu_read_unlock();
}
if (h == NETDEV_HASHENTRIES) {
if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all,
NETLINK_CB(cb->skb).portid,
nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0)
goto done; goto done;
else
h++;
} }
if (h == NETDEV_HASHENTRIES + 1) { if (ctx->all_default == 0) {
if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT, err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_dflt, net->ipv4.devconf_all,
NETLINK_CB(cb->skb).portid, NETLINK_CB(cb->skb).portid,
nlh->nlmsg_seq, nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI, RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL) < 0) NETCONFA_ALL);
if (err < 0)
goto done; goto done;
else ctx->all_default++;
h++; }
if (ctx->all_default == 1) {
err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
net->ipv4.devconf_dflt,
NETLINK_CB(cb->skb).portid,
nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
NETCONFA_ALL);
if (err < 0)
goto done;
ctx->all_default++;
} }
done: done:
cb->args[0] = h; if (err < 0 && likely(skb->len))
cb->args[1] = idx; err = skb->len;
rcu_read_unlock();
return skb->len; return err;
} }
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
@ -2825,5 +2813,6 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0); rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0); rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0);
rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf, rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
inet_netconf_dump_devconf, 0); inet_netconf_dump_devconf,
RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED);
} }

View File

@ -120,12 +120,12 @@
*/ */
#define IGMP_V1_SEEN(in_dev) \ #define IGMP_V1_SEEN(in_dev) \
(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
((in_dev)->mr_v1_seen && \ ((in_dev)->mr_v1_seen && \
time_before(jiffies, (in_dev)->mr_v1_seen))) time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \ #define IGMP_V2_SEEN(in_dev) \
(IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \ IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
((in_dev)->mr_v2_seen && \ ((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen))) time_before(jiffies, (in_dev)->mr_v2_seen)))

View File

@ -395,7 +395,7 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d", seq_printf(seq, "\nIp: %d %d",
IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);

View File

@ -2313,7 +2313,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (IN_DEV_BFORWARD(in_dev)) if (IN_DEV_BFORWARD(in_dev))
goto make_route; goto make_route;
/* not do cache if bc_forwarding is enabled */ /* not do cache if bc_forwarding is enabled */
if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
do_cache = false; do_cache = false;
goto brd_input; goto brd_input;
} }
@ -2993,7 +2993,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
#ifdef CONFIG_IP_MROUTE #ifdef CONFIG_IP_MROUTE
if (ipv4_is_multicast(dst) && if (ipv4_is_multicast(dst) &&
!ipv4_is_local_multicast(dst) && !ipv4_is_local_multicast(dst) &&
IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
int err = ipmr_get_route(net, skb, int err = ipmr_get_route(net, skb,
fl4->saddr, fl4->daddr, fl4->saddr, fl4->daddr,
r, portid); r, portid);