diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a5e4c813f17f..1b8c964b0d17 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -290,6 +290,28 @@ tcp_frto - INTEGER By default it's enabled with a non-zero value. 0 disables F-RTO. +tcp_invalid_ratelimit - INTEGER + Limit the maximal rate for sending duplicate acknowledgments + in response to incoming TCP packets that are for an existing + connection but that are invalid due to any of these reasons: + + (a) out-of-window sequence number, + (b) out-of-window acknowledgment number, or + (c) PAWS (Protection Against Wrapped Sequence numbers) check failure + + This can help mitigate simple "ack loop" DoS attacks, wherein + a buggy or malicious middlebox or man-in-the-middle can + rewrite TCP header fields in manner that causes each endpoint + to think that the other is sending invalid TCP segments, thus + causing each side to send an unterminating stream of duplicate + acknowledgments for invalid segments. + + Using 0 disables rate-limiting of dupacks in response to + invalid segments; otherwise this value specifies the minimal + space between sending such dupacks, in milliseconds. + + Default: 500 (milliseconds). + tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. Default: 2hours. diff --git a/include/net/tcp.h b/include/net/tcp.h index 28e9bd3abceb..b81f45c67b2e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -274,6 +274,7 @@ extern int sysctl_tcp_challenge_ack_limit; extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; +extern int sysctl_tcp_invalid_ratelimit; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; @@ -1236,6 +1237,37 @@ static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, return true; } +/* Return true if we're currently rate-limiting out-of-window ACKs and + * thus shouldn't send a dupack right now. We rate-limit dupacks in + * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS + * attacks that send repeated SYNs or ACKs for the same connection. To + * do this, we do not send a duplicate SYNACK or ACK if the remote + * endpoint is sending out-of-window SYNs or pure ACKs at a high rate. + */ +static inline bool tcp_oow_rate_limited(struct net *net, + const struct sk_buff *skb, + int mib_idx, u32 *last_oow_ack_time) +{ + /* Data packets without SYNs are not likely part of an ACK loop. */ + if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && + !tcp_hdr(skb)->syn) + goto not_rate_limited; + + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS_BH(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + +not_rate_limited: + return false; /* not rate-limited: go ahead, send dupack now! */ +} + static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b22224100011..6a6fb747c78d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -270,6 +270,12 @@ enum LINUX_MIB_TCPHYSTARTTRAINCWND, /* TCPHystartTrainCwnd */ LINUX_MIB_TCPHYSTARTDELAYDETECT, /* TCPHystartDelayDetect */ LINUX_MIB_TCPHYSTARTDELAYCWND, /* TCPHystartDelayCwnd */ + LINUX_MIB_TCPACKSKIPPEDSYNRECV, /* TCPACKSkippedSynRecv */ + LINUX_MIB_TCPACKSKIPPEDPAWS, /* TCPACKSkippedPAWS */ + LINUX_MIB_TCPACKSKIPPEDSEQ, /* TCPACKSkippedSeq */ + LINUX_MIB_TCPACKSKIPPEDFINWAIT2, /* TCPACKSkippedFinWait2 */ + LINUX_MIB_TCPACKSKIPPEDTIMEWAIT, /* TCPACKSkippedTimeWait */ + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8f9cd200ce20..d8953ef0770c 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -292,6 +292,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPHystartTrainCwnd", LINUX_MIB_TCPHYSTARTTRAINCWND), SNMP_MIB_ITEM("TCPHystartDelayDetect", LINUX_MIB_TCPHYSTARTDELAYDETECT), SNMP_MIB_ITEM("TCPHystartDelayCwnd", LINUX_MIB_TCPHYSTARTDELAYCWND), + SNMP_MIB_ITEM("TCPACKSkippedSynRecv", LINUX_MIB_TCPACKSKIPPEDSYNRECV), + SNMP_MIB_ITEM("TCPACKSkippedPAWS", LINUX_MIB_TCPACKSKIPPEDPAWS), + SNMP_MIB_ITEM("TCPACKSkippedSeq", LINUX_MIB_TCPACKSKIPPEDSEQ), + SNMP_MIB_ITEM("TCPACKSkippedFinWait2", LINUX_MIB_TCPACKSKIPPEDFINWAIT2), + SNMP_MIB_ITEM("TCPACKSkippedTimeWait", LINUX_MIB_TCPACKSKIPPEDTIMEWAIT), + SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e0ee384a448f..82601a68cf90 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -728,6 +728,13 @@ static struct ctl_table ipv4_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "tcp_invalid_ratelimit", + .data = &sysctl_tcp_invalid_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, { .procname = "icmp_msgs_per_sec", .data = &sysctl_icmp_msgs_per_sec, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d3dfff78fa19..9401aa43b814 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_early_retrans __read_mostly = 3; +int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */