Merge branch 'tcp-refactor-bhash2'

Kuniyuki Iwashima says:

====================
tcp: Refactor bhash2 and remove sk_bind2_node.

This series refactors the code around bhash2 and removes the bhash2-specific
fields sock.sk_bind2_node and inet_timewait_sock.tw_bind2_node.

  patch 1      : optimise bind() for non-wildcard v4-mapped-v6 address
  patch 2 -  4 : optimise bind() conflict tests
  patch 5 - 12 : Link bhash2 to bhash and unlink sk from bhash2 to
                 remove sk_bind2_node

Patch 8 will trigger a false-positive error from checkpatch.

v2: resend of https://lore.kernel.org/netdev/20231213082029.35149-1-kuniyu@amazon.com/
  * Rebase on latest net-next
  * Patch 11
    * Add change in inet_diag_dump_icsk() for recent bhash dump patch

v1: https://lore.kernel.org/netdev/20231023190255.39190-1-kuniyu@amazon.com/
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
commit 5f12303528
David S. Miller, 2023-12-22 22:15:35 +00:00
8 changed files with 94 additions and 148 deletions
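
For orientation before the diffs, here is a minimal userspace sketch of the
layout after this series (plain singly linked lists instead of hlists; every
name below is illustrative, not the kernel's): each bhash bucket keeps its
bhash2 buckets on tb->bhash2 via tb2->bhash_node, and sockets (including
timewait sockets) sit only on tb2->owners, which is what lets sk_bind2_node
and tw_bind2_node go away.

/* Toy model of the post-series bucket layout; not kernel code. */
#include <stdio.h>

struct mock_sock {
	const char *name;
	struct mock_sock *next;			/* stands in for sk_bind_node on tb2->owners */
};

struct mock_bind2_bucket {
	const char *rcv_saddr;
	struct mock_sock *owners;		/* sockets bound to this (addr, port) */
	struct mock_bind2_bucket *bhash_node;	/* stands in for tb2->bhash_node on tb->bhash2 */
};

struct mock_bind_bucket {
	unsigned short port;
	struct mock_bind2_bucket *bhash2;	/* stands in for tb->bhash2 (formerly tb->owners) */
};

int main(void)
{
	struct mock_sock s1 = { "sk-10.0.0.1", NULL };
	struct mock_sock s2 = { "sk-10.0.0.2", NULL };
	struct mock_bind2_bucket tb2_a = { "10.0.0.1", &s1, NULL };
	struct mock_bind2_bucket tb2_b = { "10.0.0.2", &s2, &tb2_a };
	struct mock_bind_bucket tb = { 8080, &tb2_b };

	/* Walking every socket bound to a port now means walking tb->bhash2
	 * and, for each tb2, walking tb2->owners; compare the new
	 * sk_for_each_bound_bhash() helper in the diff below.
	 */
	for (struct mock_bind2_bucket *tb2 = tb.bhash2; tb2; tb2 = tb2->bhash_node)
		for (struct mock_sock *sk = tb2->owners; sk; sk = sk->next)
			printf("port %hu addr %s -> %s\n", tb.port, tb2->rcv_saddr, sk->name);

	return 0;
}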

@ -88,7 +88,7 @@ struct inet_bind_bucket {
unsigned short fast_sk_family;
bool fast_ipv6_only;
struct hlist_node node;
struct hlist_head owners;
struct hlist_head bhash2;
};
struct inet_bind2_bucket {
@ -96,22 +96,17 @@ struct inet_bind2_bucket {
int l3mdev;
unsigned short port;
#if IS_ENABLED(CONFIG_IPV6)
unsigned short family;
unsigned short addr_type;
struct in6_addr v6_rcv_saddr;
#define rcv_saddr v6_rcv_saddr.s6_addr32[3]
#else
__be32 rcv_saddr;
#endif
union {
#if IS_ENABLED(CONFIG_IPV6)
struct in6_addr v6_rcv_saddr;
#endif
__be32 rcv_saddr;
};
/* Node in the bhash2 inet_bind_hashbucket chain */
struct hlist_node node;
struct hlist_node bhash_node;
/* List of sockets hashed to this bucket */
struct hlist_head owners;
/* bhash has twsk in owners, but bhash2 has twsk in
* deathrow not to add a member in struct sock_common.
*/
struct hlist_head deathrow;
};
static inline struct net *ib_net(const struct inet_bind_bucket *ib)
@ -241,7 +236,7 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
struct inet_bind2_bucket *
inet_bind2_bucket_create(struct kmem_cache *cachep, struct net *net,
struct inet_bind_hashbucket *head,
unsigned short port, int l3mdev,
struct inet_bind_bucket *tb,
const struct sock *sk);
void inet_bind2_bucket_destroy(struct kmem_cache *cachep,

@ -75,13 +75,9 @@ struct inet_timewait_sock {
struct timer_list tw_timer;
struct inet_bind_bucket *tw_tb;
struct inet_bind2_bucket *tw_tb2;
struct hlist_node tw_bind2_node;
};
#define tw_tclass tw_tos
#define twsk_for_each_bound_bhash2(__tw, list) \
hlist_for_each_entry(__tw, list, tw_bind2_node)
static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk)
{
return (struct inet_timewait_sock *)sk;

@ -784,11 +784,6 @@ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a)
cpu_to_be32(0x0000ffff))) == 0UL;
}
static inline bool ipv6_addr_v4mapped_any(const struct in6_addr *a)
{
return ipv6_addr_v4mapped(a) && ipv4_is_zeronet(a->s6_addr32[3]);
}
static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a)
{
return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]);

@ -352,7 +352,6 @@ struct sk_filter;
* @sk_txtime_report_errors: set report errors mode for SO_TXTIME
* @sk_txtime_unused: unused txtime flags
* @ns_tracker: tracker for netns reference
* @sk_bind2_node: bind node in the bhash2 table
*/
struct sock {
/*
@ -544,7 +543,6 @@ struct sock {
#endif
struct rcu_head sk_rcu;
netns_tracker ns_tracker;
struct hlist_node sk_bind2_node;
};
enum sk_pacing {
@ -873,16 +871,6 @@ static inline void sk_add_bind_node(struct sock *sk,
hlist_add_head(&sk->sk_bind_node, list);
}
static inline void __sk_del_bind2_node(struct sock *sk)
{
__hlist_del(&sk->sk_bind2_node);
}
static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
{
hlist_add_head(&sk->sk_bind2_node, list);
}
#define sk_for_each(__sk, list) \
hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
@ -900,8 +888,6 @@ static inline void sk_add_bind2_node(struct sock *sk, struct hlist_head *list)
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_bhash2(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind2_node)
/**
* sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset

@ -159,8 +159,11 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
if (sk->sk_family == AF_INET6) {
int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
return addr_type != IPV6_ADDR_ANY &&
addr_type != IPV6_ADDR_MAPPED;
if (addr_type == IPV6_ADDR_ANY)
return false;
if (addr_type != IPV6_ADDR_MAPPED)
return true;
}
#endif
return sk->sk_rcv_saddr != htonl(INADDR_ANY);
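
As a side note, a userspace sketch of the bind()-time decision above; the enum
values and helper name are purely illustrative, and a plain AF_INET socket
behaves like the mapped case. Only a true wildcard ([::], or a v4-mapped/IPv4
0.0.0.0) skips bhash2 and relies on the bhash-wide conflict walk.

/* Illustrative only; mirrors the logic of inet_use_bhash2_on_bind() above. */
#include <stdbool.h>
#include <stdio.h>

enum mock_addr_type { ADDR_ANY, ADDR_MAPPED, ADDR_UNICAST };

static bool use_bhash2_on_bind(enum mock_addr_type type, unsigned int v4_addr)
{
	if (type == ADDR_ANY)
		return false;		/* [::]: wildcard, bhash-only checks */
	if (type != ADDR_MAPPED)
		return true;		/* any specific IPv6 address */
	return v4_addr != 0;		/* ::ffff:a.b.c.d: wildcard only if 0.0.0.0 */
}

int main(void)
{
	printf("[::]            -> %d\n", use_bhash2_on_bind(ADDR_ANY, 0));
	printf("::ffff:0.0.0.0  -> %d\n", use_bhash2_on_bind(ADDR_MAPPED, 0));
	printf("::ffff:10.0.0.1 -> %d\n", use_bhash2_on_bind(ADDR_MAPPED, 0x0a000001));
	printf("2001:db8::1     -> %d\n", use_bhash2_on_bind(ADDR_UNICAST, 0));
	return 0;
}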
@ -213,18 +216,9 @@ static bool inet_bhash2_conflict(const struct sock *sk,
bool relax, bool reuseport_cb_ok,
bool reuseport_ok)
{
struct inet_timewait_sock *tw2;
struct sock *sk2;
sk_for_each_bound_bhash2(sk2, &tb2->owners) {
if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
reuseport_cb_ok, reuseport_ok))
return true;
}
twsk_for_each_bound_bhash2(tw2, &tb2->deathrow) {
sk2 = (struct sock *)tw2;
sk_for_each_bound(sk2, &tb2->owners) {
if (__inet_bhash2_conflict(sk, sk2, sk_uid, relax,
reuseport_cb_ok, reuseport_ok))
return true;
@ -233,15 +227,20 @@ static bool inet_bhash2_conflict(const struct sock *sk,
return false;
}
#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \
hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \
sk_for_each_bound(sk2, &(__tb2)->owners)
/* This should be called only when the tb and tb2 hashbuckets' locks are held */
static int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb,
const struct inet_bind2_bucket *tb2, /* may be null */
bool relax, bool reuseport_ok)
{
bool reuseport_cb_ok;
struct sock_reuseport *reuseport_cb;
kuid_t uid = sock_i_uid((struct sock *)sk);
struct sock_reuseport *reuseport_cb;
bool reuseport_cb_ok;
struct sock *sk2;
rcu_read_lock();
reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
@ -249,32 +248,29 @@ static int inet_csk_bind_conflict(const struct sock *sk,
reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
rcu_read_unlock();
/*
* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners and tb2->owners list belong
* to the same net - the one this bucket belongs to.
*/
if (!inet_use_bhash2_on_bind(sk)) {
struct sock *sk2;
sk_for_each_bound(sk2, &tb->owners)
if (inet_bind_conflict(sk, sk2, uid, relax,
reuseport_cb_ok, reuseport_ok) &&
inet_rcv_saddr_equal(sk, sk2, true))
return true;
return false;
}
/* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if
* ipv4) should have been checked already. We need to do these two
* checks separately because their spinlocks have to be acquired/released
* independently of each other, to prevent possible deadlocks
*/
return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok,
reuseport_ok);
if (inet_use_bhash2_on_bind(sk))
return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax,
reuseport_cb_ok, reuseport_ok);
/* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
* in tb->owners and tb2->owners list belong
* to the same net - the one this bucket belongs to.
*/
sk_for_each_bound_bhash(sk2, tb2, tb) {
if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok))
continue;
if (inet_rcv_saddr_equal(sk, sk2, true))
return true;
}
return false;
}
/* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or
@ -457,7 +453,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
kuid_t uid = sock_i_uid(sk);
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
if (hlist_empty(&tb->owners)) {
if (hlist_empty(&tb->bhash2)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
@ -549,7 +545,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
}
if (!found_port) {
if (!hlist_empty(&tb->owners)) {
if (!hlist_empty(&tb->bhash2)) {
if (sk->sk_reuse == SK_FORCE_REUSE ||
(tb->fastreuse > 0 && reuse) ||
sk_reuseport_match(tb, sk))
@ -569,7 +565,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
if (!tb2) {
tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
net, head2, port, l3mdev, sk);
net, head2, tb, sk);
if (!tb2)
goto fail_unlock;
bhash2_created = true;
@ -591,11 +587,10 @@ success:
fail_unlock:
if (ret) {
if (bhash2_created)
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
if (bhash_created)
inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
if (bhash2_created)
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep,
tb2);
}
if (head2_lock_acquired)
spin_unlock(&head2->lock);

@ -1104,7 +1104,7 @@ resume_bind_walk:
if (!net_eq(ib2_net(tb2), net))
continue;
sk_for_each_bound_bhash2(sk, &tb2->owners) {
sk_for_each_bound(sk, &tb2->owners) {
struct inet_sock *inet = inet_sk(sk);
if (num < s_num)

@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
INIT_HLIST_HEAD(&tb->owners);
INIT_HLIST_HEAD(&tb->bhash2);
hlist_add_head(&tb->node, &head->chain);
}
return tb;
@ -87,7 +87,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
*/
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
{
if (hlist_empty(&tb->owners)) {
if (hlist_empty(&tb->bhash2)) {
__hlist_del(&tb->node);
kmem_cache_free(cachep, tb);
}
@ -100,47 +100,52 @@ bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net
tb->l3mdev == l3mdev;
}
static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
struct net *net,
struct inet_bind_hashbucket *head,
unsigned short port, int l3mdev,
struct inet_bind_bucket *tb,
const struct sock *sk)
{
write_pnet(&tb->ib_net, net);
tb->l3mdev = l3mdev;
tb->port = port;
write_pnet(&tb2->ib_net, net);
tb2->l3mdev = tb->l3mdev;
tb2->port = tb->port;
#if IS_ENABLED(CONFIG_IPV6)
tb->family = sk->sk_family;
if (sk->sk_family == AF_INET6)
tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
else
BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
if (sk->sk_family == AF_INET6) {
tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
} else {
tb2->addr_type = IPV6_ADDR_MAPPED;
ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
}
#else
tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
tb->rcv_saddr = sk->sk_rcv_saddr;
INIT_HLIST_HEAD(&tb->owners);
INIT_HLIST_HEAD(&tb->deathrow);
hlist_add_head(&tb->node, &head->chain);
INIT_HLIST_HEAD(&tb2->owners);
hlist_add_head(&tb2->node, &head->chain);
hlist_add_head(&tb2->bhash_node, &tb->bhash2);
}
struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
unsigned short port,
int l3mdev,
struct inet_bind_bucket *tb,
const struct sock *sk)
{
struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb)
inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk);
if (tb2)
inet_bind2_bucket_init(tb2, net, head, tb, sk);
return tb;
return tb2;
}
/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
__hlist_del(&tb->bhash_node);
kmem_cache_free(cachep, tb);
}
}
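
A brief userspace illustration (not kernel code) of the v4-mapped form that
inet_bind2_bucket_init() now stores for AF_INET sockets; the struct and helper
below are simplified stand-ins for struct in6_addr and
ipv6_addr_set_v4mapped(). Because both families end up in this one canonical
128-bit form, with rcv_saddr aliasing its last 32 bits, the address-match
helpers in the next hunk reduce to a single comparison per family.

/* Simplified stand-ins; not the kernel's types or helpers. */
#include <stdint.h>
#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

struct mock_in6 { uint32_t s6_addr32[4]; };

/* Same layout ipv6_addr_set_v4mapped() produces: ::ffff:a.b.c.d */
static void set_v4mapped(uint32_t v4_be, struct mock_in6 *v6)
{
	v6->s6_addr32[0] = 0;
	v6->s6_addr32[1] = 0;
	v6->s6_addr32[2] = htonl(0x0000ffff);
	v6->s6_addr32[3] = v4_be;	/* aliased by tb2->rcv_saddr in the new struct */
}

int main(void)
{
	struct mock_in6 a;
	char buf[INET6_ADDRSTRLEN];

	set_v4mapped(inet_addr("192.0.2.1"), &a);
	inet_ntop(AF_INET6, &a, buf, sizeof(buf));
	printf("stored in tb2 as %s\n", buf);	/* ::ffff:192.0.2.1 */
	return 0;
}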
@ -149,18 +154,11 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family != tb2->family) {
if (sk->sk_family == AF_INET)
return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) &&
tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;
return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) &&
sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr;
}
if (sk->sk_family == AF_INET6)
return ipv6_addr_equal(&tb2->v6_rcv_saddr,
&sk->sk_v6_rcv_saddr);
return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
if (tb2->addr_type != IPV6_ADDR_MAPPED)
return false;
#endif
return tb2->rcv_saddr == sk->sk_rcv_saddr;
}
@ -169,10 +167,9 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
struct inet_bind2_bucket *tb2, unsigned short port)
{
inet_sk(sk)->inet_num = port;
sk_add_bind_node(sk, &tb->owners);
inet_csk(sk)->icsk_bind_hash = tb;
sk_add_bind2_node(sk, &tb2->owners);
inet_csk(sk)->icsk_bind2_hash = tb2;
sk_add_bind_node(sk, &tb2->owners);
}
/*
@ -192,21 +189,20 @@ static void __inet_put_port(struct sock *sk)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
__sk_del_bind_node(sk);
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
spin_lock(&head2->lock);
if (inet_csk(sk)->icsk_bind2_hash) {
struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;
__sk_del_bind2_node(sk);
__sk_del_bind_node(sk);
inet_csk(sk)->icsk_bind2_hash = NULL;
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
}
spin_unlock(&head2->lock);
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
spin_unlock(&head->lock);
}
@ -275,8 +271,7 @@ bhash2_find:
tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
if (!tb2) {
tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
net, head2, port,
l3mdev, child);
net, head2, tb, child);
if (!tb2)
goto error;
}
@ -836,16 +831,15 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const
return false;
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family != tb->family) {
if (sk->sk_family == AF_INET)
return ipv6_addr_any(&tb->v6_rcv_saddr) ||
ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr);
if (tb->addr_type == IPV6_ADDR_ANY)
return true;
if (tb->addr_type != IPV6_ADDR_MAPPED)
return false;
}
if (sk->sk_family == AF_INET6)
return ipv6_addr_any(&tb->v6_rcv_saddr);
if (sk->sk_family == AF_INET6 &&
!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
return false;
#endif
return tb->rcv_saddr == 0;
}
@ -942,7 +936,7 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family,
spin_lock_bh(&head->lock);
spin_lock(&head2->lock);
__sk_del_bind2_node(sk);
__sk_del_bind_node(sk);
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
spin_unlock(&head2->lock);
@ -957,10 +951,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family,
tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
if (!tb2) {
tb2 = new_tb2;
inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk);
inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
}
sk_add_bind2_node(sk, &tb2->owners);
inet_csk(sk)->icsk_bind2_hash = tb2;
sk_add_bind_node(sk, &tb2->owners);
spin_unlock(&head2->lock);
spin_unlock_bh(&head->lock);
@ -1064,7 +1058,7 @@ other_parity_scan:
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
WARN_ON(hlist_empty(&tb->bhash2));
if (!check_established(death_row, sk,
port, &tw))
goto ok;
@ -1104,7 +1098,7 @@ ok:
tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
if (!tb2) {
tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
head2, port, l3mdev, sk);
head2, tb, sk);
if (!tb2)
goto error;
}

@ -35,13 +35,11 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
if (!tb)
return;
__hlist_del(&tw->tw_bind_node);
__sk_del_bind_node((struct sock *)tw);
tw->tw_tb = NULL;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
__hlist_del(&tw->tw_bind2_node);
tw->tw_tb2 = NULL;
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
__sock_put((struct sock *)tw);
}
@ -94,18 +92,6 @@ static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
hlist_nulls_add_head_rcu(&tw->tw_node, list);
}
static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
struct hlist_head *list)
{
hlist_add_head(&tw->tw_bind_node, list);
}
static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
struct hlist_head *list)
{
hlist_add_head(&tw->tw_bind2_node, list);
}
/*
* Enter the time wait state. This is called with locally disabled BH.
* Essentially we whip up a timewait bucket, copy the relevant info into it
@ -133,11 +119,10 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
tw->tw_tb2 = icsk->icsk_bind2_hash;
WARN_ON(!icsk->icsk_bind2_hash);
inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);
sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners);
spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);