linux-stable/include/uapi/linux/tcp.h

244 lines
6.9 KiB
C
Raw Normal View History

/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the TCP protocol.
*
* Version: @(#)tcp.h 1.0.2 04/28/93
*
* Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
#ifndef _UAPI_LINUX_TCP_H
#define _UAPI_LINUX_TCP_H
#include <linux/types.h>
#include <asm/byteorder.h>
#include <linux/socket.h>
struct tcphdr {
__be16 source;
__be16 dest;
__be32 seq;
__be32 ack_seq;
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u16 res1:4,
doff:4,
fin:1,
syn:1,
rst:1,
psh:1,
ack:1,
urg:1,
ece:1,
cwr:1;
#elif defined(__BIG_ENDIAN_BITFIELD)
__u16 doff:4,
res1:4,
cwr:1,
ece:1,
urg:1,
ack:1,
psh:1,
rst:1,
syn:1,
fin:1;
#else
#error "Adjust your <asm/byteorder.h> defines"
#endif
__be16 window;
__sum16 check;
__be16 urg_ptr;
};
/*
* The union cast uses a gcc extension to avoid aliasing problems
* (union is compatible to any of its members)
* This means this part of the code is -fstrict-aliasing safe now.
*/
union tcp_word_hdr {
struct tcphdr hdr;
__be32 words[5];
};
#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3])
enum {
TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000),
TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000),
TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000),
TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000),
TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000),
TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000),
TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000),
TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000),
TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000),
TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000)
};
/*
* TCP general constants
*/
#define TCP_MSS_DEFAULT 536U /* IPv4 (RFC1122, RFC2581) */
#define TCP_MSS_DESIRED 1220U /* IPv6 (tunneled), EDNS0 (RFC3226) */
/* TCP socket options */
#define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */
#define TCP_MAXSEG 2 /* Limit MSS */
#define TCP_CORK 3 /* Never send partially complete segments */
#define TCP_KEEPIDLE 4 /* Start keeplives after this period */
#define TCP_KEEPINTVL 5 /* Interval between keepalives */
#define TCP_KEEPCNT 6 /* Number of keepalives before death */
#define TCP_SYNCNT 7 /* Number of SYN retransmits */
#define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */
#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */
#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */
#define TCP_INFO 11 /* Information about this connection. */
#define TCP_QUICKACK 12 /* Block/reenable quick acks */
#define TCP_CONGESTION 13 /* Congestion control algorithm */
#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */
#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/
#define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */
#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */
#define TCP_REPAIR 19 /* TCP sock is under repair right now */
#define TCP_REPAIR_QUEUE 20
#define TCP_QUEUE_SEQ 21
#define TCP_REPAIR_OPTIONS 22
#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
#define TCP_TIMESTAMP 24
tcp: TCP_NOTSENT_LOWAT socket option Idea of this patch is to add optional limitation of number of unsent bytes in TCP sockets, to reduce usage of kernel memory. TCP receiver might announce a big window, and TCP sender autotuning might allow a large amount of bytes in write queue, but this has little performance impact if a large part of this buffering is wasted : Write queue needs to be large only to deal with large BDP, not necessarily to cope with scheduling delays (incoming ACKS make room for the application to queue more bytes) For most workloads, using a value of 128 KB or less is OK to give applications enough time to react to POLLOUT events in time (or being awaken in a blocking sendmsg()) This patch adds two ways to set the limit : 1) Per socket option TCP_NOTSENT_LOWAT 2) A sysctl (/proc/sys/net/ipv4/tcp_notsent_lowat) for sockets not using TCP_NOTSENT_LOWAT socket option (or setting a zero value) Default value being UINT_MAX (0xFFFFFFFF), meaning this has no effect. This changes poll()/select()/epoll() to report POLLOUT only if number of unsent bytes is below tp->nosent_lowat Note this might increase number of sendmsg()/sendfile() calls when using non blocking sockets, and increase number of context switches for blocking sockets. Note this is not related to SO_SNDLOWAT (as SO_SNDLOWAT is defined as : Specify the minimum number of bytes in the buffer until the socket layer will pass the data to the protocol) Tested: netperf sessions, and watching /proc/net/protocols "memory" column for TCP With 200 concurrent netperf -t TCP_STREAM sessions, amount of kernel memory used by TCP buffers shrinks by ~55 % (20567 pages instead of 45458) lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 45458 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 45458 no 208 yes kernel y y y y y y y y y y y y y n y y y y y lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# (super_netperf 200 -t TCP_STREAM -H remote -l 90 &); sleep 60 ; grep TCP /proc/net/protocols TCPv6 1880 2 20567 no 208 yes ipv6 y y y y y y y y y y y y y n y y y y y TCP 1696 508 20567 no 208 yes kernel y y y y y y y y y y y y y n y y y y y Using 128KB has no bad effect on the throughput or cpu usage of a single flow, although there is an increase of context switches. A bonus is that we hold socket lock for a shorter amount of time and should improve latencies of ACK processing. lpq83:~# echo -1 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1651584 6291456 16384 20.00 17447.90 10^6bits/s 3.13 S -1.00 U 0.353 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 412,514 context-switches 200.034645535 seconds time elapsed lpq83:~# echo 131072 >/proc/sys/net/ipv4/tcp_notsent_lowat lpq83:~# perf stat -e context-switches ./netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3 OMNI Send TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 7.7.7.84 () port 0 AF_INET : +/-2.500% @ 99% conf. Local Remote Local Elapsed Throughput Throughput Local Local Remote Remote Local Remote Service Send Socket Recv Socket Send Time Units CPU CPU CPU CPU Service Service Demand Size Size Size (sec) Util Util Util Util Demand Demand Units Final Final % Method % Method 1593240 6291456 16384 20.00 17321.16 10^6bits/s 3.35 S -1.00 U 0.381 -1.000 usec/KB Performance counter stats for './netperf -H 7.7.7.84 -t omni -l 20 -c -i10,3': 2,675,818 context-switches 200.029651391 seconds time elapsed Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-By: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2013-07-23 03:27:07 +00:00
#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */
#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */
#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */
#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */
net/tcp-fastopen: Add new API support This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an alternative way to perform Fast Open on the active side (client). Prior to this patch, a client needs to replace the connect() call with sendto(MSG_FASTOPEN). This can be cumbersome for applications who want to use Fast Open: these socket operations are often done in lower layer libraries used by many other applications. Changing these libraries and/or the socket call sequences are not trivial. A more convenient approach is to perform Fast Open by simply enabling a socket option when the socket is created w/o changing other socket calls sequence: s = socket() create a new socket setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …); newly introduced sockopt If set, new functionality described below will be used. Return ENOTSUPP if TFO is not supported or not enabled in the kernel. connect() With cookie present, return 0 immediately. With no cookie, initiate 3WHS with TFO cookie-request option and return -1 with errno = EINPROGRESS. write()/sendmsg() With cookie present, send out SYN with data and return the number of bytes buffered. With no cookie, and 3WHS not yet completed, return -1 with errno = EINPROGRESS. No MSG_FASTOPEN flag is needed. read() Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but write() is not called yet. Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is established but no msg is received yet. Return number of bytes read if socket is established and there is msg received. The new API simplifies life for applications that always perform a write() immediately after a successful connect(). Such applications can now take advantage of Fast Open by merely making one new setsockopt() call at the time of creating the socket. Nothing else about the application's socket call sequence needs to change. Signed-off-by: Wei Wang <weiwan@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Acked-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-23 18:59:22 +00:00
#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */
struct tcp_repair_opt {
__u32 opt_code;
__u32 opt_val;
};
struct tcp_repair_window {
__u32 snd_wl1;
__u32 snd_wnd;
__u32 max_window;
__u32 rcv_wnd;
__u32 rcv_wup;
};
enum {
TCP_NO_QUEUE,
TCP_RECV_QUEUE,
TCP_SEND_QUEUE,
TCP_QUEUES_NR,
};
/* for TCP_INFO socket option */
#define TCPI_OPT_TIMESTAMPS 1
#define TCPI_OPT_SACK 2
#define TCPI_OPT_WSCALE 4
#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */
#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */
#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */
enum tcp_ca_state {
TCP_CA_Open = 0,
#define TCPF_CA_Open (1<<TCP_CA_Open)
TCP_CA_Disorder = 1,
#define TCPF_CA_Disorder (1<<TCP_CA_Disorder)
TCP_CA_CWR = 2,
#define TCPF_CA_CWR (1<<TCP_CA_CWR)
TCP_CA_Recovery = 3,
#define TCPF_CA_Recovery (1<<TCP_CA_Recovery)
TCP_CA_Loss = 4
#define TCPF_CA_Loss (1<<TCP_CA_Loss)
};
struct tcp_info {
__u8 tcpi_state;
__u8 tcpi_ca_state;
__u8 tcpi_retransmits;
__u8 tcpi_probes;
__u8 tcpi_backoff;
__u8 tcpi_options;
__u8 tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
__u8 tcpi_delivery_rate_app_limited:1;
__u32 tcpi_rto;
__u32 tcpi_ato;
__u32 tcpi_snd_mss;
__u32 tcpi_rcv_mss;
__u32 tcpi_unacked;
__u32 tcpi_sacked;
__u32 tcpi_lost;
__u32 tcpi_retrans;
__u32 tcpi_fackets;
/* Times. */
__u32 tcpi_last_data_sent;
__u32 tcpi_last_ack_sent; /* Not remembered, sorry. */
__u32 tcpi_last_data_recv;
__u32 tcpi_last_ack_recv;
/* Metrics. */
__u32 tcpi_pmtu;
__u32 tcpi_rcv_ssthresh;
__u32 tcpi_rtt;
__u32 tcpi_rttvar;
__u32 tcpi_snd_ssthresh;
__u32 tcpi_snd_cwnd;
__u32 tcpi_advmss;
__u32 tcpi_reordering;
__u32 tcpi_rcv_rtt;
__u32 tcpi_rcv_space;
__u32 tcpi_total_retrans;
__u64 tcpi_pacing_rate;
__u64 tcpi_max_pacing_rate;
__u64 tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
__u64 tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
__u32 tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
__u32 tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
__u32 tcpi_notsent_bytes;
__u32 tcpi_min_rtt;
tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In Per RFC4898, they count segments sent/received containing a positive length data segment (that includes retransmission segments carrying data). Unlike tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments carrying no data (e.g. pure ack). The patch also updates the segs_in in tcp_fastopen_add_skb() so that segs_in >= data_segs_in property is kept. Together with retransmission data, tcpi_data_segs_out gives a better signal on the rxmit rate. v6: Rebase on the latest net-next v5: Eric pointed out that checking skb->len is still needed in tcp_fastopen_add_skb() because skb can carry a FIN without data. Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in() helper is used. Comment is added to the fastopen case to explain why segs_in has to be reset and tcp_segs_in() has to be called before __skb_pull(). v4: Add comment to the changes in tcp_fastopen_add_skb() and also add remark on this case in the commit message. v3: Add const modifier to the skb parameter in tcp_segs_in() v2: Rework based on recent fix by Eric: commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment") Signed-off-by: Martin KaFai Lau <kafai@fb.com> Cc: Chris Rapier <rapier@psc.edu> Cc: Eric Dumazet <edumazet@google.com> Cc: Marcelo Ricardo Leitner <mleitner@redhat.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-03-14 17:52:15 +00:00
__u32 tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
__u32 tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
__u64 tcpi_delivery_rate;
__u64 tcpi_busy_time; /* Time (usec) busy sending data */
__u64 tcpi_rwnd_limited; /* Time (usec) limited by receive window */
__u64 tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
};
/* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
enum {
TCP_NLA_PAD,
TCP_NLA_BUSY, /* Time (usec) busy sending data */
TCP_NLA_RWND_LIMITED, /* Time (usec) limited by receive window */
TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */
};
/* for TCP_MD5SIG socket option */
#define TCP_MD5SIG_MAXKEYLEN 80
struct tcp_md5sig {
struct __kernel_sockaddr_storage tcpm_addr; /* address associated */
__u16 __tcpm_pad1; /* zero */
__u16 tcpm_keylen; /* key length */
__u32 __tcpm_pad2; /* zero */
__u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */
};
#endif /* _UAPI_LINUX_TCP_H */