linux-stable/include/net/mptcp.h
Paolo Abeni cfde141ea3 mptcp: move option parsing into mptcp_incoming_options()
The mptcp_options_received structure carries several per
packet flags (mp_capable, mp_join, etc.). Such fields must
be cleared on each packet, even on dropped ones or packet
not carrying any MPTCP options, but the current mptcp
code clears them only on TCP option reset.

On several races/corner cases we end-up with stray bits in
incoming options, leading to WARN_ON splats. e.g.:

[  171.164906] Bad mapping: ssn=32714 map_seq=1 map_data_len=32713
[  171.165006] WARNING: CPU: 1 PID: 5026 at net/mptcp/subflow.c:533 warn_bad_map (linux-mptcp/net/mptcp/subflow.c:533 linux-mptcp/net/mptcp/subflow.c:531)
[  171.167632] Modules linked in: ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel geneve ip6_udp_tunnel udp_tunnel macsec macvtap tap ipvlan macvlan 8021q garp mrp xfrm_interface veth netdevsim nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun binfmt_misc intel_rapl_msr intel_rapl_common rfkill kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel joydev virtio_balloon pcspkr i2c_piix4 sunrpc ip_tables xfs libcrc32c crc32c_intel serio_raw virtio_console ata_generic virtio_blk virtio_net net_failover failover ata_piix libata
[  171.199464] CPU: 1 PID: 5026 Comm: repro Not tainted 5.7.0-rc1.mptcp_f227fdf5d388+ #95
[  171.200886] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[  171.202546] RIP: 0010:warn_bad_map (linux-mptcp/net/mptcp/subflow.c:533 linux-mptcp/net/mptcp/subflow.c:531)
[  171.206537] Code: c1 ea 03 0f b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 04 84 d2 75 1d 8b 55 3c 44 89 e6 48 c7 c7 20 51 13 95 e8 37 8b 22 fe <0f> 0b 48 83 c4 08 5b 5d 41 5c c3 89 4c 24 04 e8 db d6 94 fe 8b 4c
[  171.220473] RSP: 0018:ffffc90000150560 EFLAGS: 00010282
[  171.221639] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  171.223108] RDX: 0000000000000000 RSI: 0000000000000008 RDI: fffff5200002a09e
[  171.224388] RBP: ffff8880aa6e3c00 R08: 0000000000000001 R09: fffffbfff2ec9955
[  171.225706] R10: ffffffff9764caa7 R11: fffffbfff2ec9954 R12: 0000000000007fca
[  171.227211] R13: ffff8881066f4a7f R14: ffff8880aa6e3c00 R15: 0000000000000020
[  171.228460] FS:  00007f8623719740(0000) GS:ffff88810be00000(0000) knlGS:0000000000000000
[  171.230065] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  171.231303] CR2: 00007ffdab190a50 CR3: 00000001038ea006 CR4: 0000000000160ee0
[  171.232586] Call Trace:
[  171.233109]  <IRQ>
[  171.233531] get_mapping_status (linux-mptcp/net/mptcp/subflow.c:691)
[  171.234371] mptcp_subflow_data_available (linux-mptcp/net/mptcp/subflow.c:736 linux-mptcp/net/mptcp/subflow.c:832)
[  171.238181] subflow_state_change (linux-mptcp/net/mptcp/subflow.c:1085 (discriminator 1))
[  171.239066] tcp_fin (linux-mptcp/net/ipv4/tcp_input.c:4217)
[  171.240123] tcp_data_queue (linux-mptcp/./include/linux/compiler.h:199 linux-mptcp/net/ipv4/tcp_input.c:4822)
[  171.245083] tcp_rcv_established (linux-mptcp/./include/linux/skbuff.h:1785 linux-mptcp/./include/net/tcp.h:1774 linux-mptcp/./include/net/tcp.h:1847 linux-mptcp/net/ipv4/tcp_input.c:5238 linux-mptcp/net/ipv4/tcp_input.c:5730)
[  171.254089] tcp_v4_rcv (linux-mptcp/./include/linux/spinlock.h:393 linux-mptcp/net/ipv4/tcp_ipv4.c:2009)
[  171.258969] ip_protocol_deliver_rcu (linux-mptcp/net/ipv4/ip_input.c:204 (discriminator 1))
[  171.260214] ip_local_deliver_finish (linux-mptcp/./include/linux/rcupdate.h:651 linux-mptcp/net/ipv4/ip_input.c:232)
[  171.261389] ip_local_deliver (linux-mptcp/./include/linux/netfilter.h:307 linux-mptcp/./include/linux/netfilter.h:301 linux-mptcp/net/ipv4/ip_input.c:252)
[  171.265884] ip_rcv (linux-mptcp/./include/linux/netfilter.h:307 linux-mptcp/./include/linux/netfilter.h:301 linux-mptcp/net/ipv4/ip_input.c:539)
[  171.273666] process_backlog (linux-mptcp/./include/linux/rcupdate.h:651 linux-mptcp/net/core/dev.c:6135)
[  171.275328] net_rx_action (linux-mptcp/net/core/dev.c:6572 linux-mptcp/net/core/dev.c:6640)
[  171.280472] __do_softirq (linux-mptcp/./arch/x86/include/asm/jump_label.h:25 linux-mptcp/./include/linux/jump_label.h:200 linux-mptcp/./include/trace/events/irq.h:142 linux-mptcp/kernel/softirq.c:293)
[  171.281379] do_softirq_own_stack (linux-mptcp/arch/x86/entry/entry_64.S:1083)
[  171.282358]  </IRQ>

We could address the issue clearing explicitly the relevant fields
in several places - tcp_parse_option, tcp_fast_parse_options,
possibly others.

Instead we move the MPTCP option parsing into the already existing
mptcp ingress hook, so that we need to clear the fields in a single
place.

This allows us dropping an MPTCP hook from the TCP code and
removing the quite large mptcp_options_received from the tcp_sock
struct. On the flip side, the MPTCP sockets will traverse the
option space twice (in tcp_parse_option() and in
mptcp_incoming_options(). That looks acceptable: we already
do that for syn and 3rd ack packets, plain TCP socket will
benefit from it, and even MPTCP sockets will experience better
code locality, reducing the jumps between TCP and MPTCP code.

v1 -> v2:
 - rebased on current '-net' tree

Fixes: 648ef4b886 ("mptcp: Implement MPTCP receive path")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-04-30 12:23:22 -07:00

208 lines
4.7 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Multipath TCP
*
* Copyright (c) 2017 - 2019, Intel Corporation.
*/
#ifndef __NET_MPTCP_H
#define __NET_MPTCP_H
#include <linux/skbuff.h>
#include <linux/tcp.h>
#include <linux/types.h>
struct seq_file;
/* MPTCP sk_buff extension data */
struct mptcp_ext {
u64 data_ack;
u64 data_seq;
u32 subflow_seq;
u16 data_len;
u8 use_map:1,
dsn64:1,
data_fin:1,
use_ack:1,
ack64:1,
mpc_map:1,
__unused:2;
/* one byte hole */
};
struct mptcp_out_options {
#if IS_ENABLED(CONFIG_MPTCP)
u16 suboptions;
u64 sndr_key;
u64 rcvr_key;
union {
struct in_addr addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct in6_addr addr6;
#endif
};
u8 addr_id;
u64 ahmac;
u8 rm_id;
u8 join_id;
u8 backup;
u32 nonce;
u64 thmac;
u32 token;
u8 hmac[20];
struct mptcp_ext ext_copy;
#endif
};
#ifdef CONFIG_MPTCP
void mptcp_init(void);
static inline bool sk_is_mptcp(const struct sock *sk)
{
return tcp_sk(sk)->is_mptcp;
}
static inline bool rsk_is_mptcp(const struct request_sock *req)
{
return tcp_rsk(req)->is_mptcp;
}
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
unsigned int *size, struct mptcp_out_options *opts);
bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
struct mptcp_out_options *opts);
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts);
void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct tcp_options_received *opt_rx);
void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts);
/* move the skb extension owership, with the assumption that 'to' is
* newly allocated
*/
static inline void mptcp_skb_ext_move(struct sk_buff *to,
struct sk_buff *from)
{
if (!skb_ext_exist(from, SKB_EXT_MPTCP))
return;
if (WARN_ON_ONCE(to->active_extensions))
skb_ext_put(to);
to->active_extensions = from->active_extensions;
to->extensions = from->extensions;
from->active_extensions = 0;
}
static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext,
const struct mptcp_ext *from_ext)
{
/* MPTCP always clears the ext when adding it to the skb, so
* holes do not bother us here
*/
return !from_ext ||
(to_ext && from_ext &&
!memcmp(from_ext, to_ext, sizeof(struct mptcp_ext)));
}
/* check if skbs can be collapsed.
* MPTCP collapse is allowed if neither @to or @from carry an mptcp data
* mapping, or if the extension of @to is the same as @from.
* Collapsing is not possible if @to lacks an extension, but @from carries one.
*/
static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
const struct sk_buff *from)
{
return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP),
skb_ext_find(from, SKB_EXT_MPTCP));
}
bool mptcp_sk_is_subflow(const struct sock *sk);
void mptcp_seq_show(struct seq_file *seq);
#else
static inline void mptcp_init(void)
{
}
static inline bool sk_is_mptcp(const struct sock *sk)
{
return false;
}
static inline bool rsk_is_mptcp(const struct request_sock *req)
{
return false;
}
static inline void mptcp_parse_option(const struct sk_buff *skb,
const unsigned char *ptr, int opsize,
struct tcp_options_received *opt_rx)
{
}
static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
unsigned int *size,
struct mptcp_out_options *opts)
{
return false;
}
static inline void mptcp_rcv_synsent(struct sock *sk)
{
}
static inline bool mptcp_synack_options(const struct request_sock *req,
unsigned int *size,
struct mptcp_out_options *opts)
{
return false;
}
static inline bool mptcp_established_options(struct sock *sk,
struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
struct mptcp_out_options *opts)
{
return false;
}
static inline void mptcp_incoming_options(struct sock *sk,
struct sk_buff *skb,
struct tcp_options_received *opt_rx)
{
}
static inline void mptcp_skb_ext_move(struct sk_buff *to,
const struct sk_buff *from)
{
}
static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
const struct sk_buff *from)
{
return true;
}
static inline bool mptcp_sk_is_subflow(const struct sock *sk)
{
return false;
}
static inline void mptcp_seq_show(struct seq_file *seq) { }
#endif /* CONFIG_MPTCP */
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
int mptcpv6_init(void);
void mptcpv6_handle_mapped(struct sock *sk, bool mapped);
#elif IS_ENABLED(CONFIG_IPV6)
static inline int mptcpv6_init(void) { return 0; }
static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { }
#endif
#endif /* __NET_MPTCP_H */