diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index a400288cb37b..6255973e3dda 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -169,10 +169,9 @@ static void loopback_setup(struct net_device *dev) dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; netif_keep_dst(dev); - dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO; + dev->hw_features = NETIF_F_GSO_SOFTWARE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_ALL_TSO - | NETIF_F_UFO + | NETIF_F_GSO_SOFTWARE | NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SCTP_CRC diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index aa7b2400f98c..9c6c8ef2e9e7 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -53,8 +53,9 @@ enum { * headers in software. */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ + NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_TUNNEL_REMCSUM_BIT, + NETIF_F_GSO_SCTP_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -128,6 +129,7 @@ enum { #define NETIF_F_TSO_MANGLEID __NETIF_F(TSO_MANGLEID) #define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) +#define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) @@ -166,7 +168,8 @@ enum { NETIF_F_FSO) /* List of features with software fallbacks. */ -#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_UFO) +#define NETIF_F_GSO_SOFTWARE (NETIF_F_ALL_TSO | NETIF_F_UFO | \ + NETIF_F_GSO_SCTP) /* * If one device supports one of these features, then enable them diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f45929ce8157..fa6df2699532 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4012,6 +4012,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ee38a4127475..dc0fca747c5e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -301,6 +301,11 @@ struct sk_buff; #endif extern int sysctl_max_skb_frags; +/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to + * segment using its current segmentation instead. + */ +#define GSO_BY_FRAGS 0xFFFF + typedef struct skb_frag_struct skb_frag_t; struct skb_frag_struct { @@ -482,6 +487,8 @@ enum { SKB_GSO_PARTIAL = 1 << 13, SKB_GSO_TUNNEL_REMCSUM = 1 << 14, + + SKB_GSO_SCTP = 1 << 15, }; #if BITS_PER_LONG > 32 @@ -2987,6 +2994,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len); int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen); void skb_scrub_packet(struct sk_buff *skb, bool xnet); unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); +bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index b392ac8382f2..632e205ca54b 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -186,6 +186,10 @@ void sctp_assocs_proc_exit(struct net *net); int sctp_remaddr_proc_init(struct net *net); void sctp_remaddr_proc_exit(struct net *net); +/* + * sctp/offload.c + */ +int sctp_offload_init(void); /* * Module global variables diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 16b013a6191c..83c5ec58b93a 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -566,6 +566,9 @@ struct sctp_chunk { /* This points to the sk_buff containing the actual data. */ struct sk_buff *skb; + /* In case of GSO packets, this will store the head one */ + struct sk_buff *head_skb; + /* These are the SCTP headers by reverse order in a packet. * Note that some of these may happen more than once. In that * case, we point at the "current" one, whatever that means @@ -696,6 +699,8 @@ struct sctp_packet { size_t overhead; /* This is the total size of all chunks INCLUDING padding. */ size_t size; + /* This is the maximum size this packet may have */ + size_t max_size; /* The packet is destined for this transport address. * The function we finally use to pass down to the next lower diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f4034817d255..977489820eb9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -89,6 +89,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", + [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f2b77e549c03..b6e0f95bef36 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #ifdef CONFIG_NET_CLS_ACT #include @@ -3116,9 +3117,13 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, int hsize; int size; - len = head_skb->len - offset; - if (len > mss) - len = mss; + if (unlikely(mss == GSO_BY_FRAGS)) { + len = list_skb->len; + } else { + len = head_skb->len - offset; + if (len > mss) + len = mss; + } hsize = skb_headlen(head_skb) - offset; if (hsize < 0) @@ -3438,6 +3443,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) NAPI_GRO_CB(skb)->same_flow = 1; return 0; } +EXPORT_SYMBOL_GPL(skb_gro_receive); void __init skb_init(void) { @@ -4378,6 +4384,8 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); + } else if (unlikely(shinfo->gso_type & SKB_GSO_SCTP)) { + thlen = sizeof(struct sctphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already @@ -4387,6 +4395,37 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) } EXPORT_SYMBOL_GPL(skb_gso_transport_seglen); +/** + * skb_gso_validate_mtu - Return in case such skb fits a given MTU + * + * @skb: GSO skb + * + * skb_gso_validate_mtu validates if a given skb will fit a wanted MTU + * once split. + */ +bool skb_gso_validate_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + const struct skb_shared_info *shinfo = skb_shinfo(skb); + const struct sk_buff *iter; + unsigned int hlen; + + hlen = skb_gso_network_seglen(skb); + + if (shinfo->gso_size != GSO_BY_FRAGS) + return hlen <= mtu; + + /* Undo this so we can re-use header sizes */ + hlen -= GSO_BY_FRAGS; + + skb_walk_frags(skb, iter) { + if (hlen + skb_headlen(iter) > mtu) + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(skb_gso_validate_mtu); + static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb) { if (skb_cow(skb, skb_headroom(skb)) < 0) { diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index cbfb1808fcc4..9f0a7b96646f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -54,7 +54,7 @@ static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 124bf0a66328..cbac493c913a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -225,7 +225,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, /* common case: locally created skb or seglen is <= mtu */ if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) || - skb_gso_network_seglen(skb) <= mtu) + skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); /* Slowpath - GSO segment length is exceeding the dst MTU. diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cbf127ae7c67..6b2f60a5c1de 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -368,7 +368,7 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->ignore_df) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0b80a7140cc4..7a4aa3450dd7 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -91,7 +91,7 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) if (skb->len <= mtu) return false; - if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) return false; return true; diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 0fca5824ad0e..6c4f7496cec6 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -11,7 +11,8 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ transport.o chunk.o sm_make_chunk.o ulpevent.o \ inqueue.o outqueue.o ulpqueue.o \ tsnmap.o bind_addr.o socket.o primitive.o \ - output.o input.o debug.o ssnmap.o auth.o + output.o input.o debug.o ssnmap.o auth.o \ + offload.o sctp_probe-y := probe.o diff --git a/net/sctp/input.c b/net/sctp/input.c index a701527a9480..6f8e676d285e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -112,7 +112,6 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; - struct sctphdr *sh; union sctp_addr src; union sctp_addr dest; int family; @@ -124,28 +123,29 @@ int sctp_rcv(struct sk_buff *skb) __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); - if (skb_linearize(skb)) + /* If packet is too small to contain a single chunk, let's not + * waste time on it anymore. + */ + if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + + skb_transport_offset(skb)) goto discard_it; - sh = sctp_hdr(skb); + if (!pskb_may_pull(skb, sizeof(struct sctphdr))) + goto discard_it; - /* Pull up the IP and SCTP headers. */ + /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); - if (skb->len < sizeof(struct sctphdr)) - goto discard_it; skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); - else if (!sctp_checksum_disable && sctp_rcv_checksum(net, skb) < 0) + else if (!sctp_checksum_disable && + !(skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) && + sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; - skb_pull(skb, sizeof(struct sctphdr)); - - /* Make sure we at least have chunk headers worth of data left. */ - if (skb->len < sizeof(struct sctp_chunkhdr)) - goto discard_it; + __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); @@ -230,7 +230,7 @@ int sctp_rcv(struct sk_buff *skb) chunk->rcvr = rcvr; /* Remember the SCTP header. */ - chunk->sctp_hdr = sh; + chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); @@ -660,19 +660,23 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) */ static int sctp_rcv_ootb(struct sk_buff *skb) { - sctp_chunkhdr_t *ch; - __u8 *ch_end; - - ch = (sctp_chunkhdr_t *) skb->data; + sctp_chunkhdr_t *ch, _ch; + int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { + /* Make sure we have at least the header there */ + if (offset + sizeof(sctp_chunkhdr_t) > skb->len) + break; + + ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); + /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) break; - ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - if (ch_end > skb_tail_pointer(skb)) + ch_end = offset + WORD_ROUND(ntohs(ch->length)); + if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the @@ -697,8 +701,8 @@ static int sctp_rcv_ootb(struct sk_buff *skb) if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; - ch = (sctp_chunkhdr_t *) ch_end; - } while (ch_end < skb_tail_pointer(skb)); + offset = ch_end; + } while (ch_end < skb->len); return 0; @@ -1173,6 +1177,17 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, { sctp_chunkhdr_t *ch; + /* We do not allow GSO frames here as we need to linearize and + * then cannot guarantee frame boundaries. This shouldn't be an + * issue as packets hitting this are mostly INIT or INIT-ACK and + * those cannot be on GSO-style anyway. + */ + if ((skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) + return NULL; + + if (skb_linearize(skb)) + return NULL; + ch = (sctp_chunkhdr_t *) skb->data; /* The code below will attempt to walk the chunk and extract diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 9d87bba0ff1d..edabbbdfca54 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -130,13 +130,25 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) * at this time. */ - if ((chunk = queue->in_progress)) { + chunk = queue->in_progress; + if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { + if (chunk->head_skb == chunk->skb) { + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + goto new_skb; + } + if (chunk->skb->next) { + chunk->skb = chunk->skb->next; + goto new_skb; + } + + if (chunk->head_skb) + chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { @@ -152,34 +164,64 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) if (!chunk) { struct list_head *entry; +next_chunk: /* Is the queue empty? */ - if (list_empty(&queue->in_chunk_list)) + entry = sctp_list_dequeue(&queue->in_chunk_list); + if (!entry) return NULL; - entry = queue->in_chunk_list.next; - chunk = queue->in_progress = - list_entry(entry, struct sctp_chunk, list); - list_del_init(entry); + chunk = list_entry(entry, struct sctp_chunk, list); - /* This is the first chunk in the packet. */ - chunk->singleton = 1; - ch = (sctp_chunkhdr_t *) chunk->skb->data; - chunk->data_accepted = 0; + /* Linearize if it's not GSO */ + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) != SKB_GSO_SCTP && + skb_is_nonlinear(chunk->skb)) { + if (skb_linearize(chunk->skb)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } + + /* Update sctp_hdr as it probably changed */ + chunk->sctp_hdr = sctp_hdr(chunk->skb); + } + + if ((skb_shinfo(chunk->skb)->gso_type & SKB_GSO_SCTP) == SKB_GSO_SCTP) { + /* GSO-marked skbs but without frags, handle + * them normally + */ + if (skb_shinfo(chunk->skb)->frag_list) + chunk->head_skb = chunk->skb; + + /* skbs with "cover letter" */ + if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) + chunk->skb = skb_shinfo(chunk->skb)->frag_list; + + if (WARN_ON(!chunk->skb)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } + } if (chunk->asoc) sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); + + queue->in_progress = chunk; + +new_skb: + /* This is the first chunk in the packet. */ + ch = (sctp_chunkhdr_t *) chunk->skb->data; + chunk->singleton = 1; + chunk->data_accepted = 0; + chunk->pdiscard = 0; + chunk->auth = 0; + chunk->has_asconf = 0; + chunk->end_of_packet = 0; + chunk->ecn_ce_done = 0; } chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); - /* In the unlikely case of an IP reassembly, the skb could be - * non-linear. If so, update chunk_end so that it doesn't go past - * the skb->tail. - */ - if (unlikely(skb_is_nonlinear(chunk->skb))) { - if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) - chunk->chunk_end = skb_tail_pointer(chunk->skb); - } skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ diff --git a/net/sctp/offload.c b/net/sctp/offload.c new file mode 100644 index 000000000000..a37887b373a7 --- /dev/null +++ b/net/sctp/offload.c @@ -0,0 +1,98 @@ +/* + * sctp_offload - GRO/GSO Offloading for SCTP + * + * Copyright (C) 2015, Marcelo Ricardo Leitner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static __le32 sctp_gso_make_checksum(struct sk_buff *skb) +{ + skb->ip_summed = CHECKSUM_NONE; + return sctp_compute_cksum(skb, skb_transport_offset(skb)); +} + +static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct sctphdr *sh; + + sh = sctp_hdr(skb); + if (!pskb_may_pull(skb, sizeof(*sh))) + goto out; + + __skb_pull(skb, sizeof(*sh)); + + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + struct skb_shared_info *pinfo = skb_shinfo(skb); + struct sk_buff *frag_iter; + + pinfo->gso_segs = 0; + if (skb->len != skb->data_len) { + /* Means we have chunks in here too */ + pinfo->gso_segs++; + } + + skb_walk_frags(skb, frag_iter) + pinfo->gso_segs++; + + segs = NULL; + goto out; + } + + segs = skb_segment(skb, features | NETIF_F_HW_CSUM); + if (IS_ERR(segs)) + goto out; + + /* All that is left is update SCTP CRC if necessary */ + if (!(features & NETIF_F_SCTP_CRC)) { + for (skb = segs; skb; skb = skb->next) { + if (skb->ip_summed == CHECKSUM_PARTIAL) { + sh = sctp_hdr(skb); + sh->checksum = sctp_gso_make_checksum(skb); + } + } + } + +out: + return segs; +} + +static const struct net_offload sctp_offload = { + .callbacks = { + .gso_segment = sctp_gso_segment, + }, +}; + +int __init sctp_offload_init(void) +{ + return inet_add_offload(&sctp_offload, IPPROTO_SCTP); +} diff --git a/net/sctp/output.c b/net/sctp/output.c index 9844fe573029..90d2e125c2f5 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -84,18 +84,42 @@ static void sctp_packet_reset(struct sctp_packet *packet) struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, __u32 vtag, int ecn_capable) { - struct sctp_chunk *chunk = NULL; + struct sctp_transport *tp = packet->transport; + struct sctp_association *asoc = tp->asoc; pr_debug("%s: packet:%p vtag:0x%x\n", __func__, packet, vtag); packet->vtag = vtag; + if (asoc && tp->dst) { + struct sock *sk = asoc->base.sk; + + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + + if (sk_can_gso(sk)) { + struct net_device *dev = tp->dst->dev; + + packet->max_size = dev->gso_max_size; + } else { + packet->max_size = asoc->pathmtu; + } + rcu_read_unlock(); + + } else { + packet->max_size = tp->pathmtu; + } + if (ecn_capable && sctp_packet_empty(packet)) { - chunk = sctp_get_ecne_prepend(packet->transport->asoc); + struct sctp_chunk *chunk; /* If there a is a prepend chunk stick it on the list before * any other chunks get appended. */ + chunk = sctp_get_ecne_prepend(asoc); if (chunk) sctp_packet_append_chunk(packet, chunk); } @@ -158,7 +182,8 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, sctp_xmit_t retval; int error = 0; - pr_debug("%s: packet:%p chunk:%p\n", __func__, packet, chunk); + pr_debug("%s: packet:%p size:%lu chunk:%p size:%d\n", __func__, + packet, packet->size, chunk, chunk->skb ? chunk->skb->len : -1); switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { case SCTP_XMIT_PMTU_FULL: @@ -381,12 +406,15 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - struct sk_buff *nskb; + struct sk_buff *nskb = NULL, *head = NULL; struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ + int pkt_size; __u8 has_data = 0; + int gso = 0; + int pktcount = 0; struct dst_entry *dst; unsigned char *auth = NULL; /* pointer to auth in skb data */ @@ -400,18 +428,37 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; - /* Allocate the new skb. */ - nskb = alloc_skb(packet->size + MAX_HEADER, gfp); - if (!nskb) + /* Allocate the head skb, or main one if not in GSO */ + if (packet->size > tp->pathmtu && !packet->ipfragok) { + if (sk_can_gso(sk)) { + gso = 1; + pkt_size = packet->overhead; + } else { + /* If this happens, we trash this packet and try + * to build a new one, hopefully correct this + * time. Application may notice this error. + */ + pr_err_once("Trying to GSO but underlying device doesn't support it."); + goto nomem; + } + } else { + pkt_size = packet->size; + } + head = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!head) goto nomem; + if (gso) { + NAPI_GRO_CB(head)->last = head; + skb_shinfo(head)->gso_type = sk->sk_gso_type; + } /* Make sure the outbound skb has enough header room reserved. */ - skb_reserve(nskb, packet->overhead + MAX_HEADER); + skb_reserve(head, packet->overhead + MAX_HEADER); /* Set the owning socket so that we know where to get the * destination IP address. */ - sctp_packet_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(head, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); @@ -422,11 +469,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) dst = dst_clone(tp->dst); if (!dst) goto no_route; - skb_dst_set(nskb, dst); + skb_dst_set(head, dst); /* Build the SCTP header. */ - sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); - skb_reset_transport_header(nskb); + sh = (struct sctphdr *)skb_push(head, sizeof(struct sctphdr)); + skb_reset_transport_header(head); sh->source = htons(packet->source_port); sh->dest = htons(packet->destination_port); @@ -441,90 +488,133 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) sh->vtag = htonl(packet->vtag); sh->checksum = 0; - /** - * 6.10 Bundling - * - * An endpoint bundles chunks by simply including multiple - * chunks in one outbound SCTP packet. ... - */ - - /** - * 3.2 Chunk Field Descriptions - * - * The total length of a chunk (including Type, Length and - * Value fields) MUST be a multiple of 4 bytes. If the length - * of the chunk is not a multiple of 4 bytes, the sender MUST - * pad the chunk with all zero bytes and this padding is not - * included in the chunk length field. The sender should - * never pad with more than 3 bytes. - * - * [This whole comment explains WORD_ROUND() below.] - */ - pr_debug("***sctp_transmit_packet***\n"); - list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { - list_del_init(&chunk->list); - if (sctp_chunk_is_data(chunk)) { - /* 6.3.1 C4) When data is in flight and when allowed - * by rule C5, a new RTT measurement MUST be made each - * round trip. Furthermore, new RTT measurements - * SHOULD be made no more than once per round-trip - * for a given destination transport address. - */ + do { + /* Set up convenience variables... */ + chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); + pktcount++; - if (!chunk->resent && !tp->rto_pending) { - chunk->rtt_in_progress = 1; - tp->rto_pending = 1; + /* Calculate packet size, so it fits in PMTU. Leave + * other chunks for the next packets. + */ + if (gso) { + pkt_size = packet->overhead; + list_for_each_entry(chunk, &packet->chunk_list, list) { + int padded = WORD_ROUND(chunk->skb->len); + + if (pkt_size + padded > tp->pathmtu) + break; + pkt_size += padded; } - has_data = 1; + /* Allocate a new skb. */ + nskb = alloc_skb(pkt_size + MAX_HEADER, gfp); + if (!nskb) + goto nomem; + + /* Make sure the outbound skb has enough header + * room reserved. + */ + skb_reserve(nskb, packet->overhead + MAX_HEADER); + } else { + nskb = head; } - padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; - if (padding) - memset(skb_put(chunk->skb, padding), 0, padding); - - /* if this is the auth chunk that we are adding, - * store pointer where it will be added and put - * the auth into the packet. + /** + * 3.2 Chunk Field Descriptions + * + * The total length of a chunk (including Type, Length and + * Value fields) MUST be a multiple of 4 bytes. If the length + * of the chunk is not a multiple of 4 bytes, the sender MUST + * pad the chunk with all zero bytes and this padding is not + * included in the chunk length field. The sender should + * never pad with more than 3 bytes. + * + * [This whole comment explains WORD_ROUND() below.] */ - if (chunk == packet->auth) - auth = skb_tail_pointer(nskb); - memcpy(skb_put(nskb, chunk->skb->len), + pkt_size -= packet->overhead; + list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { + list_del_init(&chunk->list); + if (sctp_chunk_is_data(chunk)) { + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ + + if (!chunk->resent && !tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; + } + + has_data = 1; + } + + padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + if (padding) + memset(skb_put(chunk->skb, padding), 0, padding); + + /* if this is the auth chunk that we are adding, + * store pointer where it will be added and put + * the auth into the packet. + */ + if (chunk == packet->auth) + auth = skb_tail_pointer(nskb); + + memcpy(skb_put(nskb, chunk->skb->len), chunk->skb->data, chunk->skb->len); - pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, " - "rtt_in_progress:%d\n", chunk, - sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), - chunk->has_tsn ? "TSN" : "No TSN", - chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, - ntohs(chunk->chunk_hdr->length), chunk->skb->len, - chunk->rtt_in_progress); + pr_debug("*** Chunk:%p[%s] %s 0x%x, length:%d, chunk->skb->len:%d, rtt_in_progress:%d\n", + chunk, + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), + chunk->has_tsn ? "TSN" : "No TSN", + chunk->has_tsn ? ntohl(chunk->subh.data_hdr->tsn) : 0, + ntohs(chunk->chunk_hdr->length), chunk->skb->len, + chunk->rtt_in_progress); - /* - * If this is a control chunk, this is our last - * reference. Free data chunks after they've been - * acknowledged or have failed. + /* If this is a control chunk, this is our last + * reference. Free data chunks after they've been + * acknowledged or have failed. + * Re-queue auth chunks if needed. + */ + pkt_size -= WORD_ROUND(chunk->skb->len); + + if (chunk == packet->auth && !list_empty(&packet->chunk_list)) + list_add(&chunk->list, &packet->chunk_list); + else if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + + if (!pkt_size) + break; + } + + /* SCTP-AUTH, Section 6.2 + * The sender MUST calculate the MAC as described in RFC2104 [2] + * using the hash function H as described by the MAC Identifier and + * the shared association key K based on the endpoint pair shared key + * described by the shared key identifier. The 'data' used for the + * computation of the AUTH-chunk is given by the AUTH chunk with its + * HMAC field set to zero (as shown in Figure 6) followed by all + * chunks that are placed after the AUTH chunk in the SCTP packet. */ - if (!sctp_chunk_is_data(chunk)) - sctp_chunk_free(chunk); - } + if (auth) + sctp_auth_calculate_hmac(asoc, nskb, + (struct sctp_auth_chunk *)auth, + gfp); - /* SCTP-AUTH, Section 6.2 - * The sender MUST calculate the MAC as described in RFC2104 [2] - * using the hash function H as described by the MAC Identifier and - * the shared association key K based on the endpoint pair shared key - * described by the shared key identifier. The 'data' used for the - * computation of the AUTH-chunk is given by the AUTH chunk with its - * HMAC field set to zero (as shown in Figure 6) followed by all - * chunks that are placed after the AUTH chunk in the SCTP packet. - */ - if (auth) - sctp_auth_calculate_hmac(asoc, nskb, - (struct sctp_auth_chunk *)auth, - gfp); + if (!gso) + break; + + if (skb_gro_receive(&head, nskb)) + goto nomem; + nskb = NULL; + if (WARN_ON_ONCE(skb_shinfo(head)->gso_segs >= + sk->sk_gso_max_segs)) + goto nomem; + } while (!list_empty(&packet->chunk_list)); /* 2) Calculate the Adler-32 checksum of the whole packet, * including the SCTP common header and all the @@ -532,16 +622,18 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . + * + * If it's a GSO packet, it's postponed to sctp_skb_segment. */ - if (!sctp_checksum_disable) { - if (!(dst->dev->features & NETIF_F_SCTP_CRC) || - (dst_xfrm(dst) != NULL) || packet->ipfragok) { - sh->checksum = sctp_compute_cksum(nskb, 0); + if (!sctp_checksum_disable || gso) { + if (!gso && (!(dst->dev->features & NETIF_F_SCTP_CRC) || + dst_xfrm(dst) || packet->ipfragok)) { + sh->checksum = sctp_compute_cksum(head, 0); } else { /* no need to seed pseudo checksum for SCTP */ - nskb->ip_summed = CHECKSUM_PARTIAL; - nskb->csum_start = skb_transport_header(nskb) - nskb->head; - nskb->csum_offset = offsetof(struct sctphdr, checksum); + head->ip_summed = CHECKSUM_PARTIAL; + head->csum_start = skb_transport_header(head) - head->head; + head->csum_offset = offsetof(struct sctphdr, checksum); } } @@ -557,7 +649,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) * Note: The works for IPv6 layer checks this bit too later * in transmission. See IP6_ECN_flow_xmit(). */ - tp->af_specific->ecn_capable(nskb->sk); + tp->af_specific->ecn_capable(sk); /* Set up the IP options. */ /* BUG: not implemented @@ -566,7 +658,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) /* Dump that on IP! */ if (asoc) { - asoc->stats.opackets++; + asoc->stats.opackets += pktcount; if (asoc->peer.last_sent_to != tp) /* Considering the multiple CPU scenario, this is a * "correcter" place for last_sent_to. --xguo @@ -589,16 +681,36 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) } } - pr_debug("***sctp_transmit_packet*** skb->len:%d\n", nskb->len); + pr_debug("***sctp_transmit_packet*** skb->len:%d\n", head->len); - nskb->ignore_df = packet->ipfragok; - tp->af_specific->sctp_xmit(nskb, tp); + if (gso) { + /* Cleanup our debris for IP stacks */ + memset(head->cb, 0, max(sizeof(struct inet_skb_parm), + sizeof(struct inet6_skb_parm))); + + skb_shinfo(head)->gso_segs = pktcount; + skb_shinfo(head)->gso_size = GSO_BY_FRAGS; + + /* We have to refresh this in case we are xmiting to + * more than one transport at a time + */ + rcu_read_lock(); + if (__sk_dst_get(sk) != tp->dst) { + dst_hold(tp->dst); + sk_setup_caps(sk, tp->dst); + } + rcu_read_unlock(); + } + head->ignore_df = packet->ipfragok; + tp->af_specific->sctp_xmit(head, tp); out: sctp_packet_reset(packet); return err; no_route: - kfree_skb(nskb); + kfree_skb(head); + if (nskb != head) + kfree_skb(nskb); if (asoc) IP_INC_STATS(sock_net(asoc->base.sk), IPSTATS_MIB_OUTNOROUTES); @@ -751,39 +863,63 @@ static sctp_xmit_t sctp_packet_will_fit(struct sctp_packet *packet, struct sctp_chunk *chunk, u16 chunk_len) { - size_t psize; - size_t pmtu; - int too_big; + size_t psize, pmtu; sctp_xmit_t retval = SCTP_XMIT_OK; psize = packet->size; - pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pathmtu) : - (packet->transport->pathmtu)); - - too_big = (psize + chunk_len > pmtu); + if (packet->transport->asoc) + pmtu = packet->transport->asoc->pathmtu; + else + pmtu = packet->transport->pathmtu; /* Decide if we need to fragment or resubmit later. */ - if (too_big) { - /* It's OK to fragmet at IP level if any one of the following + if (psize + chunk_len > pmtu) { + /* It's OK to fragment at IP level if any one of the following * is true: - * 1. The packet is empty (meaning this chunk is greater - * the MTU) - * 2. The chunk we are adding is a control chunk - * 3. The packet doesn't have any data in it yet and data - * requires authentication. + * 1. The packet is empty (meaning this chunk is greater + * the MTU) + * 2. The packet doesn't have any data in it yet and data + * requires authentication. */ - if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk) || + if (sctp_packet_empty(packet) || (!packet->has_data && chunk->auth)) { /* We no longer do re-fragmentation. * Just fragment at the IP layer, if we * actually hit this condition */ packet->ipfragok = 1; - } else { - retval = SCTP_XMIT_PMTU_FULL; + goto out; } + + /* It is also okay to fragment if the chunk we are + * adding is a control chunk, but only if current packet + * is not a GSO one otherwise it causes fragmentation of + * a large frame. So in this case we allow the + * fragmentation by forcing it to be in a new packet. + */ + if (!sctp_chunk_is_data(chunk) && packet->has_data) + retval = SCTP_XMIT_PMTU_FULL; + + if (psize + chunk_len > packet->max_size) + /* Hit GSO/PMTU limit, gotta flush */ + retval = SCTP_XMIT_PMTU_FULL; + + if (!packet->transport->burst_limited && + psize + chunk_len > (packet->transport->cwnd >> 1)) + /* Do not allow a single GSO packet to use more + * than half of cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + + if (packet->transport->burst_limited && + psize + chunk_len > (packet->transport->burst_limited >> 1)) + /* Do not allow a single GSO packet to use more + * than half of original cwnd. + */ + retval = SCTP_XMIT_PMTU_FULL; + /* Otherwise it will fit in the GSO packet */ } +out: return retval; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d3d50daa248b..40022ee885d7 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1516,6 +1516,9 @@ static __init int sctp_init(void) if (status) goto err_v6_add_protocol; + if (sctp_offload_init() < 0) + pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); + out: return status; err_v6_add_protocol: diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 67154b848aa9..712fb2339baa 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4003,6 +4003,8 @@ static int sctp_init_sock(struct sock *sk) return -ESOCKTNOSUPPORT; } + sk->sk_gso_type = SKB_GSO_SCTP; + /* Initialize default send parameters. These parameters can be * modified with the SCTP_DEFAULT_SEND_PARAM socket option. */