linux-stable/include/linux/rtnetlink.h
Kirill Tkhai 1a57feb847 net: Introduce net_sem for protection of pernet_list
Currently, the mutex is mostly used to protect pernet operations
list. It orders setup_net() and cleanup_net() with parallel
{un,}register_pernet_operations() calls, so ->exit{,batch} methods
of the same pernet operations are executed for a dying net, as
were used to call ->init methods, even after the net namespace
is unlinked from net_namespace_list in cleanup_net().

But there are several problems with scalability. The first one
is that more than one net can't be created or destroyed
at the same moment on the node. For big machines with many cpus
running many containers it's very sensitive.

The second one is that it's need to synchronize_rcu() after net
is removed from net_namespace_list():

Destroy net_ns:
cleanup_net()
  mutex_lock(&net_mutex)
  list_del_rcu(&net->list)
  synchronize_rcu()                                  <--- Sleep there for ages
  list_for_each_entry_reverse(ops, &pernet_list, list)
    ops_exit_list(ops, &net_exit_list)
  list_for_each_entry_reverse(ops, &pernet_list, list)
    ops_free_list(ops, &net_exit_list)
  mutex_unlock(&net_mutex)

This primitive is not fast, especially on the systems with many processors
and/or when preemptible RCU is enabled in config. So, all the time, while
cleanup_net() is waiting for RCU grace period, creation of new net namespaces
is not possible, the tasks, who makes it, are sleeping on the same mutex:

Create net_ns:
copy_net_ns()
  mutex_lock_killable(&net_mutex)                    <--- Sleep there for ages

I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop
with preemptible RCU enabled after CRIU tests round is finished.

The solution is to convert net_mutex to the rw_semaphore and add fine grain
locks to really small number of pernet_operations, what really need them.

Then, pernet_operations::init/::exit methods, modifying the net-related data,
will require down_read() locking only, while down_write() will be used
for changing pernet_list (i.e., when modules are being loaded and unloaded).

This gives signify performance increase, after all patch set is applied,
like you may see here:

%for i in {1..10000}; do unshare -n bash -c exit; done

*before*
real 1m40,377s
user 0m9,672s
sys 0m19,928s

*after*
real 0m17,007s
user 0m5,311s
sys 0m11,779

(5.8 times faster)

This patch starts replacing net_mutex to net_sem. It adds rw_semaphore,
describes the variables it protects, and makes to use, where appropriate.
net_mutex is still present, and next patches will kick it out step-by-step.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Acked-by: Andrei Vagin <avagin@virtuozzo.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-02-13 10:36:04 -05:00

129 lines
4.2 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_RTNETLINK_H
#define __LINUX_RTNETLINK_H
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/wait.h>
#include <uapi/linux/rtnetlink.h>
extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
u32 group, struct nlmsghdr *nlh, gfp_t flags);
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
u32 id, long expires, u32 error);
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags);
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
gfp_t flags, int *new_nsid, int new_ifindex);
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
unsigned change, u32 event,
gfp_t flags, int *new_nsid,
int new_ifindex);
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
gfp_t flags);
/* RTNL is used as a global lock for all changes to network configuration */
extern void rtnl_lock(void);
extern void rtnl_unlock(void);
extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern wait_queue_head_t netdev_unregistering_wq;
extern struct mutex net_mutex;
extern struct rw_semaphore net_sem;
#ifdef CONFIG_PROVE_LOCKING
extern bool lockdep_rtnl_is_held(void);
#else
static inline bool lockdep_rtnl_is_held(void)
{
return true;
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */
/**
* rcu_dereference_rtnl - rcu_dereference with debug checking
* @p: The pointer to read, prior to dereferencing
*
* Do an rcu_dereference(p), but check caller either holds rcu_read_lock()
* or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference()
*/
#define rcu_dereference_rtnl(p) \
rcu_dereference_check(p, lockdep_rtnl_is_held())
/**
* rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking
* @p: The pointer to read, prior to dereference
*
* Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh()
* or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh()
*/
#define rcu_dereference_bh_rtnl(p) \
rcu_dereference_bh_check(p, lockdep_rtnl_is_held())
/**
* rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
* @p: The pointer to read, prior to dereferencing
*
* Return the value of the specified RCU-protected pointer, but omit
* the READ_ONCE(), because caller holds RTNL.
*/
#define rtnl_dereference(p) \
rcu_dereference_protected(p, lockdep_rtnl_is_held())
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
{
return rtnl_dereference(dev->ingress_queue);
}
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);
#ifdef CONFIG_NET_INGRESS
void net_inc_ingress_queue(void);
void net_dec_ingress_queue(void);
#endif
#ifdef CONFIG_NET_EGRESS
void net_inc_egress_queue(void);
void net_dec_egress_queue(void);
#endif
void rtnetlink_init(void);
void __rtnl_unlock(void);
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);
#define ASSERT_RTNL() \
WARN_ONCE(!rtnl_is_locked(), \
"RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__)
extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev,
int *idx);
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid,
u16 flags);
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid);
extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct net_device *dev, u16 mode,
u32 flags, u32 mask, int nlflags,
u32 filter_mask,
int (*vlan_fill)(struct sk_buff *skb,
struct net_device *dev,
u32 filter_mask));
#endif /* __LINUX_RTNETLINK_H */