mptcp: stop relying on tcp_tx_skb_cache

We want to revert the skb TX cache, but MPTCP is currently
using it unconditionally.

Rework the MPTCP tx code so that tcp_tx_skb_cache is not
needed anymore: do the whole coalescing check, skb allocation
and skb initialization/update inside mptcp_sendmsg_frag(),
much like the current TCP code.

Reviewed-by: Mat Martineau <mathew.j.martineau@linux.intel.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit f70cad1085 (parent 04d8825c30)
Paolo Abeni, 2021-09-22 19:26:41 +02:00; committed by David S. Miller


@@ -1224,6 +1224,7 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 		if (likely(__mptcp_add_ext(skb, gfp))) {
 			skb_reserve(skb, MAX_TCP_HEADER);
 			skb->reserved_tailroom = skb->end - skb->tail;
+			INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
 			return skb;
 		}
 		__kfree_skb(skb);
@@ -1233,31 +1234,23 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp)
 	return NULL;
 }
 
-static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
+static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp)
 {
 	struct sk_buff *skb;
 
-	if (ssk->sk_tx_skb_cache) {
-		skb = ssk->sk_tx_skb_cache;
-		if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) &&
-			     !__mptcp_add_ext(skb, gfp)))
-			return false;
-		return true;
-	}
-
 	skb = __mptcp_do_alloc_tx_skb(sk, gfp);
 	if (!skb)
-		return false;
+		return NULL;
 
 	if (likely(sk_wmem_schedule(ssk, skb->truesize))) {
-		ssk->sk_tx_skb_cache = skb;
-		return true;
+		tcp_skb_entail(ssk, skb);
+		return skb;
 	}
 	kfree_skb(skb);
-	return false;
+	return NULL;
 }
 
-static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
+static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held)
 {
 	gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation;
 
@@ -1287,23 +1280,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			      struct mptcp_sendmsg_info *info)
 {
 	u64 data_seq = dfrag->data_seq + info->sent;
+	int offset = dfrag->offset + info->sent;
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool zero_window_probe = false;
 	struct mptcp_ext *mpext = NULL;
-	struct sk_buff *skb, *tail;
-	bool must_collapse = false;
-	int size_bias = 0;
-	int avail_size;
-	size_t ret = 0;
+	bool can_coalesce = false;
+	bool reuse_skb = true;
+	struct sk_buff *skb;
+	size_t copy;
+	int i;
 
 	pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
 		 msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
 
+	if (WARN_ON_ONCE(info->sent > info->limit ||
+			 info->limit > dfrag->data_len))
+		return 0;
+
 	/* compute send limit */
 	info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags);
-	avail_size = info->size_goal;
+	copy = info->size_goal;
+
 	skb = tcp_write_queue_tail(ssk);
-	if (skb) {
+	if (skb && copy > skb->len) {
 		/* Limit the write to the size available in the
 		 * current skb, if any, so that we create at most a new skb.
 		 * Explicitly tells TCP internals to avoid collapsing on later
@@ -1316,62 +1315,80 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 			goto alloc_skb;
 		}
 
-		must_collapse = (info->size_goal > skb->len) &&
-				(skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags);
-		if (must_collapse) {
-			size_bias = skb->len;
-			avail_size = info->size_goal - skb->len;
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset);
+		if (!can_coalesce && i >= sysctl_max_skb_frags) {
+			tcp_mark_push(tcp_sk(ssk), skb);
+			goto alloc_skb;
 		}
-	}
 
+		copy -= skb->len;
+	} else {
 alloc_skb:
-	if (!must_collapse &&
-	    !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held))
-		return 0;
+		skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held);
+		if (!skb)
+			return -ENOMEM;
+
+		i = skb_shinfo(skb)->nr_frags;
+		reuse_skb = false;
+		mpext = skb_ext_find(skb, SKB_EXT_MPTCP);
+	}
 
 	/* Zero window and all data acked? Probe. */
-	avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size);
-	if (avail_size == 0) {
+	copy = mptcp_check_allowed_size(msk, data_seq, copy);
+	if (copy == 0) {
 		u64 snd_una = READ_ONCE(msk->snd_una);
 
-		if (skb || snd_una != msk->snd_nxt)
+		if (snd_una != msk->snd_nxt) {
+			tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 			return 0;
+		}
+
 		zero_window_probe = true;
 		data_seq = snd_una - 1;
-		avail_size = 1;
-	}
+		copy = 1;
 
-	if (WARN_ON_ONCE(info->sent > info->limit ||
-			 info->limit > dfrag->data_len))
-		return 0;
+		/* all mptcp-level data is acked, no skbs should be present into the
+		 * ssk write queue
+		 */
+		WARN_ON_ONCE(reuse_skb);
+	}
 
-	ret = info->limit - info->sent;
-	tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags,
-			      dfrag->page, dfrag->offset + info->sent, &ret);
-	if (!tail) {
-		tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk));
+	copy = min_t(size_t, copy, info->limit - info->sent);
+	if (!sk_wmem_schedule(ssk, copy)) {
+		tcp_remove_empty_skb(ssk, tcp_write_queue_tail(ssk));
 		return -ENOMEM;
 	}
 
-	/* if the tail skb is still the cached one, collapsing really happened.
-	 */
-	if (skb == tail) {
-		TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH;
-		mpext->data_len += ret;
-		WARN_ON_ONCE(zero_window_probe);
-		goto out;
+	if (can_coalesce) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+	} else {
+		get_page(dfrag->page);
+		skb_fill_page_desc(skb, i, dfrag->page, offset, copy);
 	}
 
-	mpext = skb_ext_find(tail, SKB_EXT_MPTCP);
-	if (WARN_ON_ONCE(!mpext)) {
-		/* should never reach here, stream corrupted */
-		return -EINVAL;
+	skb->len += copy;
+	skb->data_len += copy;
+	skb->truesize += copy;
+	sk_wmem_queued_add(ssk, copy);
+	sk_mem_charge(ssk, copy);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy);
+	TCP_SKB_CB(skb)->end_seq += copy;
+	tcp_skb_pcount_set(skb, 0);
+
+	/* on skb reuse we just need to update the DSS len */
+	if (reuse_skb) {
+		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
+		mpext->data_len += copy;
+		WARN_ON_ONCE(zero_window_probe);
+		goto out;
 	}
 
 	memset(mpext, 0, sizeof(*mpext));
 	mpext->data_seq = data_seq;
 	mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
-	mpext->data_len = ret;
+	mpext->data_len = copy;
 	mpext->use_map = 1;
 	mpext->dsn64 = 1;
 
@@ -1380,18 +1397,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
 		 mpext->dsn64);
 
 	if (zero_window_probe) {
-		mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
+		mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
 		mpext->frozen = 1;
 		if (READ_ONCE(msk->csum_enabled))
-			mptcp_update_data_checksum(tail, ret);
+			mptcp_update_data_checksum(skb, copy);
 		tcp_push_pending_frames(ssk);
 		return 0;
 	}
 out:
 	if (READ_ONCE(msk->csum_enabled))
-		mptcp_update_data_checksum(tail, ret);
-	mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
-	return ret;
+		mptcp_update_data_checksum(skb, copy);
+	mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
+	return copy;
 }
 
 #define MPTCP_SEND_BURST_SIZE		((1 << 16) - \
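
The core of the rework shown above is the decision mptcp_sendmsg_frag() now takes by itself: if the dfrag being sent is contiguous with the last page frag of the skb at the tail of the subflow write queue, grow that frag in place; if the frag slots are exhausted, mark a push and start a new skb; otherwise allocate a fresh skb via mptcp_alloc_tx_skb() and begin a new DSS mapping. The following is a minimal userspace sketch of that coalesce-or-allocate step only; the struct and helper names are simplified stand-ins, not the kernel API.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_SKB_FRAGS 17	/* stand-in for sysctl_max_skb_frags */

/* Simplified stand-ins for the kernel structures involved. */
struct frag {
	const char *page;	/* backing "page" */
	size_t off;		/* offset of the frag inside the page */
	size_t len;		/* bytes covered by the frag */
};

struct skb {
	struct frag frags[MAX_SKB_FRAGS];
	int nr_frags;
	size_t len;
};

/* Mirrors the skb_can_coalesce() test: the new chunk must start exactly
 * where the last frag of the tail skb ends, on the same page.
 */
static bool can_coalesce(const struct skb *skb, const char *page, size_t off)
{
	const struct frag *last;

	if (!skb->nr_frags)
		return false;
	last = &skb->frags[skb->nr_frags - 1];
	return last->page == page && last->off + last->len == off;
}

/* Coalesce-or-allocate decision modelled on the reworked mptcp_sendmsg_frag():
 * grow the last frag when possible, otherwise take a new frag slot, otherwise
 * tell the caller a fresh skb is needed.
 */
static bool append_fragment(struct skb *skb, const char *page,
			    size_t off, size_t copy)
{
	if (can_coalesce(skb, page, off)) {
		skb->frags[skb->nr_frags - 1].len += copy;
	} else {
		if (skb->nr_frags >= MAX_SKB_FRAGS)
			return false;	/* caller must allocate a new skb */
		skb->frags[skb->nr_frags].page = page;
		skb->frags[skb->nr_frags].off = off;
		skb->frags[skb->nr_frags].len = copy;
		skb->nr_frags++;
	}
	skb->len += copy;
	return true;
}

int main(void)
{
	static const char page[4096];
	struct skb tail = { .nr_frags = 0, .len = 0 };

	append_fragment(&tail, page, 0, 100);	/* first chunk: new frag slot */
	append_fragment(&tail, page, 100, 50);	/* contiguous: coalesced into frag 0 */
	append_fragment(&tail, page, 500, 10);	/* gap: takes a second frag slot */
	printf("nr_frags=%d len=%zu\n", tail.nr_frags, tail.len);	/* nr_frags=2 len=160 */
	return 0;
}

In the kernel the same outcome is reached with skb_can_coalesce(), skb_frag_size_add(), skb_fill_page_desc() and tcp_mark_push(); keeping that logic directly inside mptcp_sendmsg_frag() is what lets the patch drop the tcp_tx_skb_cache dependency.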