From 5e040d4b1a440b832c7b4cf8116eebcdff91909c Mon Sep 17 00:00:00 2001
From: Edward Cree
Date: Tue, 6 Aug 2019 14:53:20 +0100
Subject: [PATCH 1/3] sfc: don't score irq moderation points for GRO

We already scored points when handling the RX event, no-one else does
this, and looking at the history it appears this was originally meant
to only score on merges, not on GRO_NORMAL.  Moreover, it gets in the
way of changing GRO to not immediately pass GRO_NORMAL skbs to the
stack.
Performance testing with four TCP streams received on a single CPU
(where throughput was line rate of 9.4Gbps in all tests) showed a
13.7% reduction in RX CPU usage (n=6, p=0.03).

Signed-off-by: Edward Cree
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/sfc/rx.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index d5db045535d3..85ec07f5a674 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -412,7 +412,6 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
                   unsigned int n_frags, u8 *eh)
 {
         struct napi_struct *napi = &channel->napi_str;
-        gro_result_t gro_result;
         struct efx_nic *efx = channel->efx;
         struct sk_buff *skb;
 
@@ -449,9 +448,7 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
 
         skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-        gro_result = napi_gro_frags(napi);
-        if (gro_result != GRO_DROP)
-                channel->irq_mod_score += 2;
+        napi_gro_frags(napi);
 }
 
 /* Allocate and construct an SKB around page fragments */

From 67270136949e1d55e1a614b0a2e053b7762384ef Mon Sep 17 00:00:00 2001
From: Edward Cree
Date: Tue, 6 Aug 2019 14:53:31 +0100
Subject: [PATCH 2/3] sfc: falcon: don't score irq moderation points for GRO

Same rationale as for sfc, except that this wasn't performance-tested.

Signed-off-by: Edward Cree
Signed-off-by: David S. Miller
---
 drivers/net/ethernet/sfc/falcon/rx.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c
index fd850d3d8ec0..05ea3523890a 100644
--- a/drivers/net/ethernet/sfc/falcon/rx.c
+++ b/drivers/net/ethernet/sfc/falcon/rx.c
@@ -424,7 +424,6 @@ ef4_rx_packet_gro(struct ef4_channel *channel, struct ef4_rx_buffer *rx_buf,
                   unsigned int n_frags, u8 *eh)
 {
         struct napi_struct *napi = &channel->napi_str;
-        gro_result_t gro_result;
         struct ef4_nic *efx = channel->efx;
         struct sk_buff *skb;
 
@@ -460,9 +459,7 @@ ef4_rx_packet_gro(struct ef4_channel *channel, struct ef4_rx_buffer *rx_buf,
 
         skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-        gro_result = napi_gro_frags(napi);
-        if (gro_result != GRO_DROP)
-                channel->irq_mod_score += 2;
+        napi_gro_frags(napi);
 }
 
 /* Allocate and construct an SKB around page fragments */
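For context on what the first two patches remove: irq_mod_score feeds the
sfc driver's adaptive interrupt moderation, which periodically raises or
lowers the per-channel IRQ moderation interval depending on how busy the
channel has been.  The sketch below paraphrases that feedback loop (based
on the poll path in drivers/net/ethernet/sfc/efx.c; field and threshold
names follow the driver but should be treated as approximate, not
verbatim):

        /* Paraphrased sketch, not verbatim driver code: roughly every
         * 1000 IRQs, the poll loop turns the accumulated irq_mod_score
         * into a moderation adjustment.  RX/TX event handling still
         * scores points; the patches above only stop GRO from scoring
         * the same packets a second time.
         */
        static void efx_update_irq_mod(struct efx_nic *efx,
                                       struct efx_channel *channel)
        {
                int step = efx->irq_mod_step_us;

                if (channel->irq_mod_score < irq_adapt_low_thresh) {
                        /* Channel is quiet: lower moderation for latency */
                        if (channel->irq_moderation_us > step) {
                                channel->irq_moderation_us -= step;
                                efx->type->push_irq_moderation(channel);
                        }
                } else if (channel->irq_mod_score > irq_adapt_high_thresh) {
                        /* Channel is busy: raise moderation to cut IRQ rate */
                        if (channel->irq_moderation_us <
                            efx->irq_rx_moderation_us) {
                                channel->irq_moderation_us += step;
                                efx->type->push_irq_moderation(channel);
                        }
                }

                channel->irq_count = 0;
                channel->irq_mod_score = 0;
        }

Since napi_gro_frags() scored 2 points for every packet that wasn't
dropped, GRO was double-counting work the RX event handler had already
scored, biasing channels toward higher moderation.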
From 323ebb61e32b478e2432c5a3cbf9e2ca678a9609 Mon Sep 17 00:00:00 2001
From: Edward Cree
Date: Tue, 6 Aug 2019 14:53:55 +0100
Subject: [PATCH 3/3] net: use listified RX for handling GRO_NORMAL skbs

When GRO decides not to coalesce a packet, in napi_frags_finish(),
instead of passing it to the stack immediately, place it on a list in
the napi struct.  Then, at flush time (napi_complete_done(),
napi_poll(), or napi_busy_loop()), call
netif_receive_skb_list_internal() on the list.
We'd like to do that in napi_gro_flush(), but it's not called if
!napi->gro_bitmask, so we have to do it in the callers instead.  (There
are a handful of drivers that call napi_gro_flush() themselves, but
it's not clear why, or whether this will affect them.)
Because a full 64 packets is an inefficiently large batch, also consume
the list whenever it exceeds gro_normal_batch, a new net/core sysctl
that defaults to 8.

Signed-off-by: Edward Cree
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h  |  3 +++
 net/core/dev.c             | 44 +++++++++++++++++++++++++++++++++++---
 net/core/sysctl_net_core.c |  8 ++++++++
 3 files changed, 52 insertions(+), 3 deletions(-)
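Before the diff itself, here is the flow this patch creates, condensed
into one sketch.  This is a simplified paraphrase of the net/core/dev.c
changes below, not code from the patch; napi_poll_cycle() is a
hypothetical wrapper standing in for the napi_poll() and
napi_complete_done() machinery:

        /* Illustrative only: how one NAPI poll cycle delivers GRO_NORMAL
         * skbs after this patch.
         */
        static int napi_poll_cycle(struct napi_struct *n, int budget)
        {
                /* The driver's poll handler calls napi_gro_frags().
                 * Packets GRO declines to merge now go through
                 * gro_normal_one(), which appends to n->rx_list and
                 * flushes via netif_receive_skb_list_internal() as soon
                 * as n->rx_count reaches gro_normal_batch (default 8).
                 */
                int work = n->poll(n, budget);

                /* If the budget wasn't exhausted, napi_complete_done()
                 * runs gro_normal_list(n), so a partial batch (fewer
                 * than gro_normal_batch skbs) is not left stranded until
                 * the next interrupt.
                 */
                if (work < budget)
                        napi_complete_done(n, work);
                return work;
        }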
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 88292953aa6f..55ac223553f8 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -332,6 +332,8 @@ struct napi_struct {
         struct net_device       *dev;
         struct gro_list         gro_hash[GRO_HASH_BUCKETS];
         struct sk_buff          *skb;
+        struct list_head        rx_list; /* Pending GRO_NORMAL skbs */
+        int                     rx_count; /* length of rx_list */
         struct hrtimer          timer;
         struct list_head        dev_list;
         struct hlist_node       napi_hash_node;
@@ -4239,6 +4241,7 @@ extern int              dev_weight_rx_bias;
 extern int              dev_weight_tx_bias;
 extern int              dev_rx_weight;
 extern int              dev_tx_weight;
+extern int              gro_normal_batch;
 
 bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
diff --git a/net/core/dev.c b/net/core/dev.c
index af071b0ce88e..49589ed2018d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3963,6 +3963,8 @@ int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
 int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */
 int dev_rx_weight __read_mostly = 64;
 int dev_tx_weight __read_mostly = 64;
+/* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
+int gro_normal_batch __read_mostly = 8;
 
 /* Called with irq disabled */
 static inline void ____napi_schedule(struct softnet_data *sd,
@@ -5747,6 +5749,26 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
 }
 EXPORT_SYMBOL(napi_get_frags);
 
+/* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
+static void gro_normal_list(struct napi_struct *napi)
+{
+        if (!napi->rx_count)
+                return;
+        netif_receive_skb_list_internal(&napi->rx_list);
+        INIT_LIST_HEAD(&napi->rx_list);
+        napi->rx_count = 0;
+}
+
+/* Queue one GRO_NORMAL SKB up for list processing.  If batch size exceeded,
+ * pass the whole batch up to the stack.
+ */
+static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb)
+{
+        list_add_tail(&skb->list, &napi->rx_list);
+        if (++napi->rx_count >= gro_normal_batch)
+                gro_normal_list(napi);
+}
+
 static gro_result_t napi_frags_finish(struct napi_struct *napi,
                                       struct sk_buff *skb,
                                       gro_result_t ret)
@@ -5756,8 +5778,8 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
         case GRO_HELD:
                 __skb_push(skb, ETH_HLEN);
                 skb->protocol = eth_type_trans(skb, skb->dev);
-                if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
-                        ret = GRO_DROP;
+                if (ret == GRO_NORMAL)
+                        gro_normal_one(napi, skb);
                 break;
 
         case GRO_DROP:
@@ -6034,6 +6056,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
                                  NAPIF_STATE_IN_BUSY_POLL)))
                 return false;
 
+        gro_normal_list(n);
+
         if (n->gro_bitmask) {
                 unsigned long timeout = 0;
 
@@ -6119,10 +6143,19 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
          * Ideally, a new ndo_busy_poll_stop() could avoid another round.
          */
         rc = napi->poll(napi, BUSY_POLL_BUDGET);
+        /* We can't gro_normal_list() here, because napi->poll() might have
+         * rearmed the napi (napi_complete_done()) in which case it could
+         * already be running on another CPU.
+         */
         trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
         netpoll_poll_unlock(have_poll_lock);
-        if (rc == BUSY_POLL_BUDGET)
+        if (rc == BUSY_POLL_BUDGET) {
+                /* As the whole budget was spent, we still own the napi so can
+                 * safely handle the rx_list.
+                 */
+                gro_normal_list(napi);
                 __napi_schedule(napi);
+        }
         local_bh_enable();
 }
 
@@ -6167,6 +6200,7 @@ restart:
                 }
                 work = napi_poll(napi, BUSY_POLL_BUDGET);
                 trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+                gro_normal_list(napi);
 count:
                 if (work > 0)
                         __NET_ADD_STATS(dev_net(napi->dev),
@@ -6272,6 +6306,8 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
         napi->timer.function = napi_watchdog;
         init_gro_hash(napi);
         napi->skb = NULL;
+        INIT_LIST_HEAD(&napi->rx_list);
+        napi->rx_count = 0;
         napi->poll = poll;
         if (weight > NAPI_POLL_WEIGHT)
                 netdev_err_once(dev, "%s() called with weight %d\n", __func__,
@@ -6368,6 +6404,8 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
                 goto out_unlock;
         }
 
+        gro_normal_list(n);
+
         if (n->gro_bitmask) {
                 /* flush too old packets
                  * If HZ < 1000, flush all packets.
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 8da5b3a54dac..eb29e5adc84d 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -567,6 +567,14 @@ static struct ctl_table net_core_table[] = {
                 .mode = 0644,
                 .proc_handler = proc_do_static_key,
         },
+        {
+                .procname       = "gro_normal_batch",
+                .data           = &gro_normal_batch,
+                .maxlen         = sizeof(unsigned int),
+                .mode           = 0644,
+                .proc_handler   = proc_dointvec_minmax,
+                .extra1         = SYSCTL_ONE,
+        },
         { }
 };
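Once the series is applied, the batch size is tunable at runtime through
the new sysctl.  Because proc_dointvec_minmax is given .extra1 =
SYSCTL_ONE and no .extra2, writes below 1 are rejected and there is no
upper bound.  For example (commands illustrative):

        # cat /proc/sys/net/core/gro_normal_batch
        8
        # sysctl -w net.core.gro_normal_batch=16

Setting it to 1 makes gro_normal_one() flush after every skb, which
effectively restores per-packet delivery of GRO_NORMAL traffic, while
larger values trade a little latency for bigger
netif_receive_skb_list_internal() batches.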