Merge branch 'mptcp-fixes-for-6-4'

Matthieu Baerts says:

====================
mptcp: fixes for 6.4

Patch 1 correctly handles disconnect() failures that can happen in some
specific cases: now the socket state is set as unconnected as expected.
That fixes an issue introduced in v6.2.

Patch 2 fixes a divide by zero bug in mptcp_recvmsg() with a fix similar
to a recent one from Eric Dumazet for TCP introducing sk_wait_pending
flag. It should address an issue present in MPTCP from almost the
beginning, from v5.9.

Patch 3 fixes a possible list corruption on passive MPJ even if the race
seems very unlikely, better be safe than sorry. The possible issue is
present from v5.17.

Patch 4 consolidates fallback and non fallback state machines to avoid
leaking some MPTCP sockets. The fix is likely needed for versions from
v5.11.

Patch 5 drops code that is no longer used after the introduction of
patch 4/6. This is not really a fix but this patch can probably land in
the -net tree as well not to leave unused code.

Patch 6 ensures listeners are unhashed before updating their sk status
to avoid possible deadlocks when diag info are going to be retrieved
with a lock. Even if it should not be visible with the way we are
currently getting diag info, the issue is present from v5.17.
====================

Link: https://lore.kernel.org/r/20230620-upstream-net-20230620-misc-fixes-for-v6-4-v1-0-f36aa5eae8b9@tessares.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski 2023-06-21 22:44:59 -07:00
commit 533aa0baaa
4 changed files with 76 additions and 107 deletions

View file

@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
if (err)
return err;
inet_sk_state_store(newsk, TCP_LISTEN);
err = kernel_listen(ssock, backlog);
if (err)
return err;

View file

@ -44,7 +44,7 @@ enum {
static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
static void __mptcp_destroy_sock(struct sock *sk);
static void __mptcp_check_send_data_fin(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
static struct net_device mptcp_napi_dev;
@ -424,8 +424,7 @@ static bool mptcp_pending_data_fin_ack(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
return !__mptcp_check_fallback(msk) &&
((1 << sk->sk_state) &
return ((1 << sk->sk_state) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
msk->write_seq == READ_ONCE(msk->snd_una);
}
@ -583,9 +582,6 @@ static bool mptcp_check_data_fin(struct sock *sk)
u64 rcv_data_fin_seq;
bool ret = false;
if (__mptcp_check_fallback(msk))
return ret;
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
* msk->rcv_data_fin was set when parsing the incoming options
@ -623,7 +619,8 @@ static bool mptcp_check_data_fin(struct sock *sk)
}
ret = true;
mptcp_send_ack(msk);
if (!__mptcp_check_fallback(msk))
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
return ret;
@ -850,12 +847,12 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
return true;
}
static void __mptcp_flush_join_list(struct sock *sk)
static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list)
{
struct mptcp_subflow_context *tmp, *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
list_for_each_entry_safe(subflow, tmp, join_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast(ssk);
@ -897,49 +894,6 @@ bool mptcp_schedule_work(struct sock *sk)
return false;
}
void mptcp_subflow_eof(struct sock *sk)
{
if (!test_and_set_bit(MPTCP_WORK_EOF, &mptcp_sk(sk)->flags))
mptcp_schedule_work(sk);
}
static void mptcp_check_for_eof(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
int receivers = 0;
mptcp_for_each_subflow(msk, subflow)
receivers += !subflow->rx_eof;
if (receivers)
return;
if (!(sk->sk_shutdown & RCV_SHUTDOWN)) {
/* hopefully temporary hack: propagate shutdown status
* to msk, when all subflows agree on it
*/
WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
sk->sk_data_ready(sk);
}
switch (sk->sk_state) {
case TCP_ESTABLISHED:
inet_sk_state_store(sk, TCP_CLOSE_WAIT);
break;
case TCP_FIN_WAIT1:
inet_sk_state_store(sk, TCP_CLOSING);
break;
case TCP_FIN_WAIT2:
inet_sk_state_store(sk, TCP_CLOSE);
break;
default:
return;
}
mptcp_close_wake_up(sk);
}
static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@ -1609,7 +1563,7 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags)
if (!mptcp_timer_pending(sk))
mptcp_reset_timer(sk);
if (do_check_data_fin)
__mptcp_check_send_data_fin(sk);
mptcp_check_send_data_fin(sk);
}
static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
@ -1727,7 +1681,13 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
*copied_syn = 0;
} else if (ret && ret != -EINPROGRESS) {
mptcp_disconnect(sk, 0);
/* The disconnect() op called by tcp_sendmsg_fastopen()/
* __inet_stream_connect() can fail, due to looking check,
* see mptcp_disconnect().
* Attempt it again outside the problematic scope.
*/
if (!mptcp_disconnect(sk, 0))
sk->sk_socket->state = SS_UNCONNECTED;
}
inet_sk(sk)->defer_connect = 0;
@ -2158,9 +2118,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
break;
}
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
if (sk->sk_shutdown & RCV_SHUTDOWN) {
/* race breaker: the shutdown could be after the
* previous receive queue check
@ -2389,7 +2346,10 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
if (!dispose_it) {
tcp_disconnect(ssk, 0);
/* The MPTCP code never wait on the subflow sockets, TCP-level
* disconnect should never fail
*/
WARN_ON_ONCE(tcp_disconnect(ssk, 0));
msk->subflow->state = SS_UNCONNECTED;
mptcp_subflow_ctx_reset(subflow);
release_sock(ssk);
@ -2408,13 +2368,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
kfree_rcu(subflow, rcu);
} else {
/* otherwise tcp will dispose of the ssk and subflow ctx */
if (ssk->sk_state == TCP_LISTEN) {
tcp_set_state(ssk, TCP_CLOSE);
mptcp_subflow_queue_clean(sk, ssk);
inet_csk_listen_stop(ssk);
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
}
__tcp_close(ssk, 0);
/* close acquired an extra ref */
@ -2671,16 +2624,12 @@ static void mptcp_worker(struct work_struct *work)
if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
goto unlock;
mptcp_check_data_fin_ack(sk);
mptcp_check_fastclose(msk);
mptcp_pm_nl_work(msk);
if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
mptcp_check_for_eof(msk);
__mptcp_check_send_data_fin(sk);
mptcp_check_send_data_fin(sk);
mptcp_check_data_fin_ack(sk);
mptcp_check_data_fin(sk);
if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
@ -2812,13 +2761,19 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
break;
fallthrough;
case TCP_SYN_SENT:
tcp_disconnect(ssk, O_NONBLOCK);
WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK));
break;
default:
if (__mptcp_check_fallback(mptcp_sk(sk))) {
pr_debug("Fallback");
ssk->sk_shutdown |= how;
tcp_shutdown(ssk, how);
/* simulate the data_fin ack reception to let the state
* machine move forward
*/
WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
mptcp_schedule_work(sk);
} else {
pr_debug("Sending DATA_FIN on subflow %p", ssk);
tcp_send_ack(ssk);
@ -2858,7 +2813,7 @@ static int mptcp_close_state(struct sock *sk)
return next & TCP_ACTION_FIN;
}
static void __mptcp_check_send_data_fin(struct sock *sk)
static void mptcp_check_send_data_fin(struct sock *sk)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
@ -2876,19 +2831,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
WRITE_ONCE(msk->snd_nxt, msk->write_seq);
/* fallback socket will not get data_fin/ack, can move to the next
* state now
*/
if (__mptcp_check_fallback(msk)) {
WRITE_ONCE(msk->snd_una, msk->write_seq);
if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_close_wake_up(sk);
} else if (sk->sk_state == TCP_FIN_WAIT1) {
inet_sk_state_store(sk, TCP_FIN_WAIT2);
}
}
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@ -2908,7 +2850,7 @@ static void __mptcp_wr_shutdown(struct sock *sk)
WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
WRITE_ONCE(msk->snd_data_fin_enable, 1);
__mptcp_check_send_data_fin(sk);
mptcp_check_send_data_fin(sk);
}
static void __mptcp_destroy_sock(struct sock *sk)
@ -2953,10 +2895,24 @@ static __poll_t mptcp_check_readable(struct mptcp_sock *msk)
return EPOLLIN | EPOLLRDNORM;
}
static void mptcp_listen_inuse_dec(struct sock *sk)
static void mptcp_check_listen_stop(struct sock *sk)
{
if (inet_sk_state_load(sk) == TCP_LISTEN)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
struct sock *ssk;
if (inet_sk_state_load(sk) != TCP_LISTEN)
return;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
ssk = mptcp_sk(sk)->first;
if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
return;
lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
mptcp_subflow_queue_clean(sk, ssk);
inet_csk_listen_stop(ssk);
mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
tcp_set_state(ssk, TCP_CLOSE);
release_sock(ssk);
}
bool __mptcp_close(struct sock *sk, long timeout)
@ -2969,7 +2925,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
mptcp_listen_inuse_dec(sk);
mptcp_check_listen_stop(sk);
inet_sk_state_store(sk, TCP_CLOSE);
goto cleanup;
}
@ -3073,15 +3029,20 @@ static int mptcp_disconnect(struct sock *sk, int flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
/* Deny disconnect if other threads are blocked in sk_wait_event()
* or inet_wait_for_connect().
*/
if (sk->sk_wait_pending)
return -EBUSY;
/* We are on the fastopen error path. We can't call straight into the
* subflows cleanup code due to lock nesting (we are already under
* msk->firstsocket lock). Do nothing and leave the cleanup to the
* caller.
* msk->firstsocket lock).
*/
if (msk->fastopening)
return 0;
return -EBUSY;
mptcp_listen_inuse_dec(sk);
mptcp_check_listen_stop(sk);
inet_sk_state_store(sk, TCP_CLOSE);
mptcp_stop_timer(sk);
@ -3140,6 +3101,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
#endif
nsk->sk_wait_pending = 0;
__mptcp_init_sock(nsk);
msk = mptcp_sk(nsk);
@ -3327,9 +3289,14 @@ static void mptcp_release_cb(struct sock *sk)
for (;;) {
unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
msk->push_pending;
struct list_head join_list;
if (!flags)
break;
INIT_LIST_HEAD(&join_list);
list_splice_init(&msk->join_list, &join_list);
/* the following actions acquire the subflow socket lock
*
* 1) can't be invoked in atomic scope
@ -3340,8 +3307,9 @@ static void mptcp_release_cb(struct sock *sk)
msk->push_pending = 0;
msk->cb_flags &= ~flags;
spin_unlock_bh(&sk->sk_lock.slock);
if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
__mptcp_flush_join_list(sk);
__mptcp_flush_join_list(sk, &join_list);
if (flags & BIT(MPTCP_PUSH_PENDING))
__mptcp_push_pending(sk, 0);
if (flags & BIT(MPTCP_RETRANSMIT))

View file

@ -113,7 +113,6 @@
/* MPTCP socket atomic flags */
#define MPTCP_NOSPACE 1
#define MPTCP_WORK_RTX 2
#define MPTCP_WORK_EOF 3
#define MPTCP_FALLBACK_DONE 4
#define MPTCP_WORK_CLOSE_SUBFLOW 5
@ -476,14 +475,13 @@ struct mptcp_subflow_context {
send_mp_fail : 1,
send_fastclose : 1,
send_infinite_map : 1,
rx_eof : 1,
remote_key_valid : 1, /* received the peer key from */
disposable : 1, /* ctx can be free at ulp release time */
stale : 1, /* unable to snd/rcv data, do not use for xmit */
local_id_valid : 1, /* local_id is correctly initialized */
valid_csum_seen : 1, /* at least one csum validated */
is_mptfo : 1, /* subflow is doing TFO */
__unused : 8;
__unused : 9;
enum mptcp_data_avail data_avail;
u32 remote_nonce;
u64 thmac;
@ -720,7 +718,6 @@ static inline u64 mptcp_expand_seq(u64 old_seq, u64 cur_seq, bool use_64bit)
void __mptcp_check_push(struct sock *sk, struct sock *ssk);
void __mptcp_data_acked(struct sock *sk);
void __mptcp_error_report(struct sock *sk);
void mptcp_subflow_eof(struct sock *sk);
bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit);
static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
{

View file

@ -1749,14 +1749,16 @@ static void subflow_state_change(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
struct mptcp_sock *msk;
__subflow_state_change(sk);
msk = mptcp_sk(parent);
if (subflow_simultaneous_connect(sk)) {
mptcp_propagate_sndbuf(parent, sk);
mptcp_do_fallback(sk);
mptcp_rcv_space_init(mptcp_sk(parent), sk);
pr_fallback(mptcp_sk(parent));
mptcp_rcv_space_init(msk, sk);
pr_fallback(msk);
subflow->conn_finished = 1;
mptcp_set_connected(parent);
}
@ -1772,11 +1774,12 @@ static void subflow_state_change(struct sock *sk)
subflow_sched_work_if_closed(mptcp_sk(parent), sk);
if (__mptcp_check_fallback(mptcp_sk(parent)) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
mptcp_subflow_eof(parent);
}
/* when the fallback subflow closes the rx side, trigger a 'dummy'
* ingress data fin, so that the msk state will follow along
*/
if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
mptcp_schedule_work(parent);
}
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)