mptcp: invoke MP_FAIL response when needed

mptcp_mp_fail_no_response shouldn't be invoked on each worker run, it
should be invoked only when MP_FAIL response timeout occurs.

This patch refactors the MP_FAIL response logic.

It leverages the fact that only the MPC/first subflow can gracefully
fail to avoid unneeded subflows traversal: the failing subflow can
be only msk->first.

A new 'fail_tout' field is added to the subflow context to record the
MP_FAIL response timeout and use such field to reliably share the
timeout timer between the MP_FAIL event and the MPTCP socket close
timeout.

Finally, a new ack is generated to send out MP_FAIL notification as soon
as we hit the relevant condition, instead of waiting a possibly unbound
time for the next data packet.

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/281
Fixes: d9fb797046 ("mptcp: Do not traverse the subflow connection list without lock")
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Geliang Tang <geliang.tang@suse.com>
Signed-off-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Geliang Tang 2022-06-27 18:02:37 -07:00 committed by Jakub Kicinski
parent 31bf11de14
commit 76a13b3157
4 changed files with 82 additions and 45 deletions

View file

@ -299,22 +299,21 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct sock *s = (struct sock *)msk;
pr_debug("fail_seq=%llu", fail_seq);
if (!READ_ONCE(msk->allow_infinite_fallback))
return;
if (!READ_ONCE(subflow->mp_fail_response_expect)) {
if (!subflow->fail_tout) {
pr_debug("send MP_FAIL response and infinite map");
subflow->send_mp_fail = 1;
subflow->send_infinite_map = 1;
} else if (!sock_flag(sk, SOCK_DEAD)) {
tcp_send_ack(sk);
} else {
pr_debug("MP_FAIL response received");
sk_stop_timer(s, &s->sk_timer);
WRITE_ONCE(subflow->fail_tout, 0);
}
}

View file

@ -500,7 +500,7 @@ static void mptcp_set_timeout(struct sock *sk)
__mptcp_set_timeout(sk, tout);
}
static bool tcp_can_send_ack(const struct sock *ssk)
static inline bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
(TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN));
@ -2175,21 +2175,6 @@ static void mptcp_retransmit_timer(struct timer_list *t)
sock_put(sk);
}
static struct mptcp_subflow_context *
mp_fail_response_expect_subflow(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow, *ret = NULL;
mptcp_for_each_subflow(msk, subflow) {
if (READ_ONCE(subflow->mp_fail_response_expect)) {
ret = subflow;
break;
}
}
return ret;
}
static void mptcp_timeout_timer(struct timer_list *t)
{
struct sock *sk = from_timer(sk, t, sk_timer);
@ -2518,27 +2503,50 @@ static void __mptcp_retrans(struct sock *sk)
mptcp_reset_timer(sk);
}
/* schedule the timeout timer for the relevant event: either close timeout
* or mp_fail timeout. The close timeout takes precedence on the mp_fail one
*/
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout)
{
struct sock *sk = (struct sock *)msk;
unsigned long timeout, close_timeout;
if (!fail_tout && !sock_flag(sk, SOCK_DEAD))
return;
close_timeout = inet_csk(sk)->icsk_mtup.probe_timestamp - tcp_jiffies32 + jiffies + TCP_TIMEWAIT_LEN;
/* the close timeout takes precedence on the fail one, and here at least one of
* them is active
*/
timeout = sock_flag(sk, SOCK_DEAD) ? close_timeout : fail_tout;
sk_reset_timer(sk, &sk->sk_timer, timeout);
}
static void mptcp_mp_fail_no_response(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
struct sock *ssk;
struct sock *ssk = msk->first;
bool slow;
subflow = mp_fail_response_expect_subflow(msk);
if (subflow) {
pr_debug("MP_FAIL doesn't respond, reset the subflow");
if (!ssk)
return;
ssk = mptcp_subflow_tcp_sock(subflow);
slow = lock_sock_fast(ssk);
mptcp_subflow_reset(ssk);
unlock_sock_fast(ssk, slow);
}
pr_debug("MP_FAIL doesn't respond, reset the subflow");
slow = lock_sock_fast(ssk);
mptcp_subflow_reset(ssk);
WRITE_ONCE(mptcp_subflow_ctx(ssk)->fail_tout, 0);
unlock_sock_fast(ssk, slow);
mptcp_reset_timeout(msk, 0);
}
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
struct sock *sk = &msk->sk.icsk_inet.sk;
unsigned long fail_tout;
int state;
lock_sock(sk);
@ -2575,7 +2583,9 @@ static void mptcp_worker(struct work_struct *work)
if (test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
__mptcp_retrans(sk);
mptcp_mp_fail_no_response(msk);
fail_tout = msk->first ? READ_ONCE(mptcp_subflow_ctx(msk->first)->fail_tout) : 0;
if (fail_tout && time_after(jiffies, fail_tout))
mptcp_mp_fail_no_response(msk);
unlock:
release_sock(sk);
@ -2822,6 +2832,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
static void mptcp_close(struct sock *sk, long timeout)
{
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
lock_sock(sk);
@ -2840,10 +2851,16 @@ static void mptcp_close(struct sock *sk, long timeout)
cleanup:
/* orphan all the subflows */
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
/* since the close timeout takes precedence on the fail one,
* cancel the latter
*/
if (ssk == msk->first)
subflow->fail_tout = 0;
sock_orphan(ssk);
unlock_sock_fast(ssk, slow);
}
@ -2852,13 +2869,13 @@ static void mptcp_close(struct sock *sk, long timeout)
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (mptcp_sk(sk)->token)
mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL);
mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL);
if (sk->sk_state == TCP_CLOSE) {
__mptcp_destroy_sock(sk);
do_cancel_work = true;
} else {
sk_reset_timer(sk, &sk->sk_timer, jiffies + TCP_TIMEWAIT_LEN);
mptcp_reset_timeout(msk, 0);
}
release_sock(sk);
if (do_cancel_work)

View file

@ -468,7 +468,6 @@ struct mptcp_subflow_context {
local_id_valid : 1, /* local_id is correctly initialized */
valid_csum_seen : 1; /* at least one csum validated */
enum mptcp_data_avail data_avail;
bool mp_fail_response_expect;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
@ -482,6 +481,7 @@ struct mptcp_subflow_context {
u8 stale_count;
long delegated_status;
unsigned long fail_tout;
);
@ -662,6 +662,7 @@ void mptcp_get_options(const struct sk_buff *skb,
void mptcp_finish_connect(struct sock *sk);
void __mptcp_set_connected(struct sock *sk);
void mptcp_reset_timeout(struct mptcp_sock *msk, unsigned long fail_tout);
static inline bool mptcp_is_fully_established(struct sock *sk)
{
return inet_sk_state_load(sk) == TCP_ESTABLISHED &&

View file

@ -971,7 +971,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
bool csum_reqd = READ_ONCE(msk->csum_enabled);
struct sock *sk = (struct sock *)msk;
struct mptcp_ext *mpext;
struct sk_buff *skb;
u16 data_len;
@ -1013,9 +1012,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk,
pr_debug("infinite mapping received");
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
subflow->map_data_len = 0;
if (!sock_flag(ssk, SOCK_DEAD))
sk_stop_timer(sk, &sk->sk_timer);
return MAPPING_INVALID;
}
@ -1162,6 +1158,33 @@ static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
return !subflow->fully_established;
}
static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
unsigned long fail_tout;
/* greceful failure can happen only on the MPC subflow */
if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
return;
/* since the close timeout take precedence on the fail one,
* no need to start the latter when the first is already set
*/
if (sock_flag((struct sock *)msk, SOCK_DEAD))
return;
/* we don't need extreme accuracy here, use a zero fail_tout as special
* value meaning no fail timeout at all;
*/
fail_tout = jiffies + TCP_RTO_MAX;
if (!fail_tout)
fail_tout = 1;
WRITE_ONCE(subflow->fail_tout, fail_tout);
tcp_send_ack(ssk);
mptcp_reset_timeout(msk, subflow->fail_tout);
}
static bool subflow_check_data_avail(struct sock *ssk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
@ -1236,11 +1259,8 @@ static bool subflow_check_data_avail(struct sock *ssk)
tcp_send_active_reset(ssk, GFP_ATOMIC);
while ((skb = skb_peek(&ssk->sk_receive_queue)))
sk_eat_skb(ssk, skb);
} else if (!sock_flag(ssk, SOCK_DEAD)) {
WRITE_ONCE(subflow->mp_fail_response_expect, true);
sk_reset_timer((struct sock *)msk,
&((struct sock *)msk)->sk_timer,
jiffies + TCP_RTO_MAX);
} else {
mptcp_subflow_fail(msk, ssk);
}
WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
return true;