mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-25 11:55:37 +00:00
b261eda84e
Kazuho Oku reported that setsockopt(SO_INCOMING_CPU) does not work with setsockopt(SO_REUSEPORT) since v4.6. With the combination of SO_REUSEPORT and SO_INCOMING_CPU, we could build a highly efficient server application. setsockopt(SO_INCOMING_CPU) associates a CPU with a TCP listener or UDP socket, and then incoming packets processed on the CPU will likely be distributed to the socket. Technically, a socket could even receive packets handled on another CPU if no sockets in the reuseport group have the same CPU receiving the flow. The logic exists in compute_score() so that a socket will get a higher score if it has the same CPU with the flow. However, the score gets ignored after the blamed two commits, which introduced a faster socket selection algorithm for SO_REUSEPORT. This patch introduces a counter of sockets with SO_INCOMING_CPU in a reuseport group to check if we should iterate all sockets to find a proper one. We increment the counter when * calling listen() if the socket has SO_INCOMING_CPU and SO_REUSEPORT * enabling SO_INCOMING_CPU if the socket is in a reuseport group Also, we decrement it when * detaching a socket out of the group to apply SO_INCOMING_CPU to migrated TCP requests * disabling SO_INCOMING_CPU if the socket is in a reuseport group When the counter reaches 0, we can get back to the O(1) selection algorithm. The overall changes are negligible for the non-SO_INCOMING_CPU case, and the only notable thing is that we have to update sk_incomnig_cpu under reuseport_lock. Otherwise, the race prevents transitioning to the O(n) algorithm and results in the wrong socket selection. cpu1 (setsockopt) cpu2 (listen) +-----------------+ +-------------+ lock_sock(sk1) lock_sock(sk2) reuseport_update_incoming_cpu(sk1, val) . | /* set CPU as 0 */ |- WRITE_ONCE(sk1->incoming_cpu, val) | | spin_lock_bh(&reuseport_lock) | reuseport_grow(sk2, reuse) | . | |- more_socks_size = reuse->max_socks * 2U; | |- if (more_socks_size > U16_MAX && | | reuse->num_closed_socks) | | . | | |- RCU_INIT_POINTER(sk1->sk_reuseport_cb, NULL); | | `- __reuseport_detach_closed_sock(sk1, reuse) | | . | | `- reuseport_put_incoming_cpu(sk1, reuse) | | . | | | /* Read shutdown()ed sk1's sk_incoming_cpu | | | * without lock_sock(). | | | */ | | `- if (sk1->sk_incoming_cpu >= 0) | | . | | | /* decrement not-yet-incremented | | | * count, which is never incremented. | | | */ | | `- __reuseport_put_incoming_cpu(reuse); | | | `- spin_lock_bh(&reuseport_lock) | |- spin_lock_bh(&reuseport_lock) | |- reuse = rcu_dereference_protected(sk1->sk_reuseport_cb, ...) |- if (!reuse) | . | | /* Cannot increment reuse->incoming_cpu. */ | `- goto out; | `- spin_unlock_bh(&reuseport_lock) Fixes: |
||
---|---|---|
.. | ||
bpf_sk_storage.c | ||
datagram.c | ||
dev.c | ||
dev.h | ||
dev_addr_lists.c | ||
dev_addr_lists_test.c | ||
dev_ioctl.c | ||
devlink.c | ||
drop_monitor.c | ||
dst.c | ||
dst_cache.c | ||
failover.c | ||
fib_notifier.c | ||
fib_rules.c | ||
filter.c | ||
flow_dissector.c | ||
flow_offload.c | ||
gen_estimator.c | ||
gen_stats.c | ||
gro.c | ||
gro_cells.c | ||
hwbm.c | ||
link_watch.c | ||
lwt_bpf.c | ||
lwtunnel.c | ||
Makefile | ||
neighbour.c | ||
net-procfs.c | ||
net-sysfs.c | ||
net-sysfs.h | ||
net-traces.c | ||
net_namespace.c | ||
netclassid_cgroup.c | ||
netevent.c | ||
netpoll.c | ||
netprio_cgroup.c | ||
of_net.c | ||
page_pool.c | ||
pktgen.c | ||
ptp_classifier.c | ||
request_sock.c | ||
rtnetlink.c | ||
scm.c | ||
secure_seq.c | ||
selftests.c | ||
skbuff.c | ||
skmsg.c | ||
sock.c | ||
sock_destructor.h | ||
sock_diag.c | ||
sock_map.c | ||
sock_reuseport.c | ||
stream.c | ||
sysctl_net_core.c | ||
timestamping.c | ||
tso.c | ||
utils.c | ||
xdp.c |