From 6fa01ccd883021105e9f8af7d04b9f156fa3494a Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Fri, 22 Apr 2016 18:36:35 -0700 Subject: [PATCH] skbuff: Add pskb_extract() helper function A pattern of skb usage seen in modules such as RDS-TCP is to extract `to_copy' bytes from the received TCP segment, starting at some offset `off' into a new skb `clone'. This is done in the ->data_ready callback, where the clone skb is queued up for rx on the PF_RDS socket, while the parent TCP segment is returned unchanged back to the TCP engine. The existing code uses the sequence clone = skb_clone(..); pskb_pull(clone, off, ..); pskb_trim(clone, to_copy, ..); with the intention of discarding the first `off' bytes. However, skb_clone() + pskb_pull() implies pksb_expand_head(), which ends up doing a redundant memcpy of bytes that will then get discarded in __pskb_pull_tail(). To avoid this inefficiency, this commit adds pskb_extract() that creates the clone, and memcpy's only the relevant header/frag/frag_list to the start of `clone'. pskb_trim() is then invoked to trim clone down to the requested to_copy bytes. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 + net/core/skbuff.c | 242 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 244 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index da0ace389fec..a1ce63979ad8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2986,6 +2986,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb); int skb_ensure_writable(struct sk_buff *skb, int write_len); int skb_vlan_pop(struct sk_buff *skb); int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, + gfp_t gfp); static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7ff7788b0151..7a1d48983f81 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -4622,3 +4622,245 @@ failure: return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); + +/* carve out the first off bytes from skb when off < headlen */ +static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + const int headlen, gfp_t gfp_mask) +{ + int i; + int size = skb_end_offset(skb); + int new_hlen = headlen - off; + u8 *data; + int doff = 0; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); + skb->len -= off; + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb); + } else { + /* we can reuse existing recount- all we did was + * relocate values + */ + skb_free_head(skb); + } + + doff = (data - skb->head); + skb->head = data; + skb->data = data; + skb->head_frag = 0; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + doff = 0; +#else + skb->end = skb->head + size; +#endif + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + return 0; +} + +static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); + +/* carve out the first eat bytes from skb's frag_list. May recurse into + * pskb_carve() + */ +static int pskb_carve_frag_list(struct sk_buff *skb, + struct skb_shared_info *shinfo, int eat, + gfp_t gfp_mask) +{ + struct sk_buff *list = shinfo->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) { + pr_err("Not enough bytes to eat. Want %d\n", eat); + return -EFAULT; + } + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_shared(list)) { + clone = skb_clone(list, gfp_mask); + if (!clone) + return -ENOMEM; + insp = list->next; + list = clone; + } else { + /* This may be pulled without problems. */ + insp = list; + } + if (pskb_carve(list, eat, gfp_mask) < 0) { + kfree_skb(clone); + return -ENOMEM; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = shinfo->frag_list) != insp) { + shinfo->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + shinfo->frag_list = clone; + } + return 0; +} + +/* carve off first len bytes from skb. Split line (off) is in the + * non-linear part of skb + */ +static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + int pos, gfp_t gfp_mask) +{ + int i, k = 0; + int size = skb_end_offset(skb); + u8 *data; + const int nfrags = skb_shinfo(skb)->nr_frags; + struct skb_shared_info *shinfo; + int doff = 0; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); + for (i = 0; i < nfrags; i++) { + int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + fsize > off) { + shinfo->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < off) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + shinfo->frags[0].page_offset += off - pos; + skb_frag_size_sub(&shinfo->frags[0], off - pos); + } + skb_frag_ref(skb, i); + k++; + } + pos += fsize; + } + shinfo->nr_frags = k; + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + if (k == 0) { + /* split line is in frag list */ + pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask); + } + skb_release_data(skb); + + doff = (data - skb->head); + skb->head = data; + skb->head_frag = 0; + skb->data = data; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; + doff = 0; +#else + skb->end = skb->head + size; +#endif + skb_reset_tail_pointer(skb); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + skb->len -= off; + skb->data_len = skb->len; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; +} + +/* remove len bytes from the beginning of the skb */ +static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) +{ + int headlen = skb_headlen(skb); + + if (len < headlen) + return pskb_carve_inside_header(skb, len, headlen, gfp); + else + return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); +} + +/* Extract to_copy bytes starting at off from skb, and return this in + * a new skb + */ +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, + int to_copy, gfp_t gfp) +{ + struct sk_buff *clone = skb_clone(skb, gfp); + + if (!clone) + return NULL; + + if (pskb_carve(clone, off, gfp) < 0 || + pskb_trim(clone, to_copy)) { + kfree_skb(clone); + return NULL; + } + return clone; +} +EXPORT_SYMBOL(pskb_extract);