From 7bd175928280c3e5d741cb9948cffaa61b7cc7c6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:25:26 -0800 Subject: [PATCH 01/13] igb: Add support for DMA_ATTR_WEAK_ORDERING Since we are already using DMA attributes in igb for Rx there is no reason why we can't also apply DMA_ATTR_WEAK_ORDERING which is needed on some platforms to improve performance. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 3 +++ drivers/net/ethernet/intel/igb/igb_main.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index acbc3abe2ddd..87c9fe9d6f18 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -148,6 +148,9 @@ struct vf_data_storage { /* How many Rx Buffers do we bundle into one write to the hardware ? */ #define IGB_RX_BUFFER_WRITE 16 /* Must be power of 2 */ +#define IGB_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + #define AUTO_ALL_MODES 0 #define IGB_EEPROM_APME 0x0400 diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index be456bae8169..cf7ee9cdac6f 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3963,7 +3963,7 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) buffer_info->dma, PAGE_SIZE, DMA_FROM_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); + IGB_RX_DMA_ATTR); __page_frag_cache_drain(buffer_info->page, buffer_info->pagecnt_bias); @@ -6990,7 +6990,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, */ dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, DMA_FROM_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); + IGB_RX_DMA_ATTR); __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } @@ -7250,7 +7250,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, /* map page for use */ dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, - DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); + DMA_FROM_DEVICE, IGB_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use From 7ec0116c9131a8cd58dc456ae2bd5bc9976460d1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:25:41 -0800 Subject: [PATCH 02/13] igb: Use length to determine if descriptor is done This change makes it so that we use the length of the packet instead of the DD status bit to determine if a new descriptor is ready to be processed. The obvious advantage is that it cuts down on reads as we don't really even need the DD bit if going from a 0 to a non-zero value on size is enough to inform us that the packet has been completed. In addition I have updated the code so that we only reset the Rx descriptor length for descriptor zero when resetting a ring instead of having to do a memset with 0 over the entire ring. By doing this we can save some time on initialization. 
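For illustration only (not part of the patch), a minimal sketch of the completion test this change moves to, assuming the union e1000_adv_rx_desc write-back layout used by igb; the helper name is hypothetical:

    /* Hypothetical helper: a descriptor is "done" once hardware writes a
     * non-zero packet length back into it, so the DD status bit no longer
     * needs to be read.
     */
    static bool igb_rx_desc_done(const union e1000_adv_rx_desc *rx_desc)
    {
            if (!rx_desc->wb.upper.length)
                    return false;

            /* order the length read before any reads of the packet data */
            dma_rmb();

            return true;
    }

This mirrors the tests added below in igb_clean_test_rings() and igb_clean_rx_irq().
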
Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_ethtool.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 14 ++++++++------ 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 737b664d004c..3f5f7744c90f 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -1811,7 +1811,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring, tx_ntc = tx_ring->next_to_clean; rx_desc = IGB_RX_DESC(rx_ring, rx_ntc); - while (igb_test_staterr(rx_desc, E1000_RXD_STAT_DD)) { + while (rx_desc->wb.upper.length) { /* check Rx buffer */ rx_buffer_info = &rx_ring->rx_buffer_info[rx_ntc]; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index cf7ee9cdac6f..1d76d3a90a17 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3720,6 +3720,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, struct igb_ring *ring) { struct e1000_hw *hw = &adapter->hw; + union e1000_adv_rx_desc *rx_desc; u64 rdba = ring->dma; int reg_idx = ring->reg_idx; u32 srrctl = 0, rxdctl = 0; @@ -3758,6 +3759,10 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, rxdctl |= IGB_RX_HTHRESH << 8; rxdctl |= IGB_RX_WTHRESH << 16; + /* initialize Rx descriptor 0 */ + rx_desc = IGB_RX_DESC(ring, 0); + rx_desc->wb.upper.length = 0; + /* enable receive descriptor fetching */ rxdctl |= E1000_RXDCTL_QUEUE_ENABLE; wr32(E1000_RXDCTL(reg_idx), rxdctl); @@ -3973,9 +3978,6 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) size = sizeof(struct igb_rx_buffer) * rx_ring->count; memset(rx_ring->rx_buffer_info, 0, size); - /* Zero out the descriptor ring */ - memset(rx_ring->desc, 0, rx_ring->size); - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -7172,7 +7174,7 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) rx_desc = IGB_RX_DESC(rx_ring, rx_ring->next_to_clean); - if (!rx_desc->wb.upper.status_error) + if (!rx_desc->wb.upper.length) break; /* This memory barrier is needed to keep us from reading @@ -7312,8 +7314,8 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) i -= rx_ring->count; } - /* clear the status bits for the next_to_use descriptor */ - rx_desc->wb.upper.status_error = 0; + /* clear the length for the next_to_use descriptor */ + rx_desc->wb.upper.length = 0; cleaned_count--; } while (cleaned_count); From d2bead576e67c34fe9ea174bb245254d0fe237b5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:25:50 -0800 Subject: [PATCH 03/13] igb: Clear Rx buffer_info in configure instead of clean This change makes it so that instead of going through the entire ring on Rx cleanup we only go through the region that was designated to be cleaned up and stop when we reach the region where new allocations should start. In addition we can avoid having to perform a memset on the Rx buffer_info structures until we are about to start using the ring again. By deferring this we can avoid dirtying the cache any more than we have to which can help to improve the time needed to bring the interface down and then back up again in a reset or suspend/resume cycle. 
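For illustration only (a rough sketch using the same field names, not the patch itself): teardown now walks just the region hardware may have touched, wrapping at the end of the ring, while the full memset of rx_buffer_info is deferred to igb_configure_rx_ring():

    u16 i = rx_ring->next_to_clean;

    /* only next_to_clean .. next_to_alloc can hold pages that still need
     * to be unmapped and freed; everything else is already clean
     */
    while (i != rx_ring->next_to_alloc) {
            struct igb_rx_buffer *bi = &rx_ring->rx_buffer_info[i];

            /* ... unmap bi->dma and release bi->page here ... */

            if (++i == rx_ring->count)
                    i = 0;          /* wrap around the ring */
    }

This is also why the allocation in igb_setup_rx_resources() can drop from vzalloc() to vmalloc(): zeroing now happens right before the ring is handed back to hardware.
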
Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 24 ++++++++++------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 1d76d3a90a17..680f0d3d9b72 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3435,7 +3435,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring) size = sizeof(struct igb_rx_buffer) * rx_ring->count; - rx_ring->rx_buffer_info = vzalloc(size); + rx_ring->rx_buffer_info = vmalloc(size); if (!rx_ring->rx_buffer_info) goto err; @@ -3759,6 +3759,10 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, rxdctl |= IGB_RX_HTHRESH << 8; rxdctl |= IGB_RX_WTHRESH << 16; + /* initialize rx_buffer_info */ + memset(ring->rx_buffer_info, 0, + sizeof(struct igb_rx_buffer) * ring->count); + /* initialize Rx descriptor 0 */ rx_desc = IGB_RX_DESC(ring, 0); rx_desc->wb.upper.length = 0; @@ -3937,23 +3941,16 @@ static void igb_free_all_rx_resources(struct igb_adapter *adapter) **/ static void igb_clean_rx_ring(struct igb_ring *rx_ring) { - unsigned long size; - u16 i; + u16 i = rx_ring->next_to_clean; if (rx_ring->skb) dev_kfree_skb(rx_ring->skb); rx_ring->skb = NULL; - if (!rx_ring->rx_buffer_info) - return; - /* Free all the Rx ring sk_buffs */ - for (i = 0; i < rx_ring->count; i++) { + while (i != rx_ring->next_to_alloc) { struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i]; - if (!buffer_info->page) - continue; - /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ @@ -3972,12 +3969,11 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) __page_frag_cache_drain(buffer_info->page, buffer_info->pagecnt_bias); - buffer_info->page = NULL; + i++; + if (i == rx_ring->count) + i = 0; } - size = sizeof(struct igb_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_buffer_info, 0, size); - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; From 7cc6fd4c60f267e17b0baef1580d7a6258c0a6f0 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:26:02 -0800 Subject: [PATCH 04/13] igb: Don't bother clearing Tx buffer_info in igb_clean_tx_ring In the case of the Tx rings we need to only clear the Tx buffer_info when we are resetting the rings. Ideally we do this when we configure the ring to bring it back up instead of when we are taking it down in order to avoid dirtying pages we don't need to. In addition we don't need to clear the Tx descriptor ring since we will fully repopulate it when we begin transmitting frames and next_to_watch can be cleared to prevent the ring from being cleaned beyond that point instead of needing to touch anything in the Tx descriptor ring. Finally with these changes we can avoid having to reset the skb member of the Tx buffer_info structure in the cleanup path since the skb will always be associated with the first buffer which has next_to_watch set. 
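For illustration only, a condensed sketch of the bookkeeping described above (field names follow the driver; this fragment is not the patch itself): next_to_watch alone marks how far a packet extends, so clearing it is enough to retire the packet without ever zeroing the descriptor ring:

    u16 i = tx_ring->next_to_clean;
    struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i];
    union e1000_adv_tx_desc *eop_desc = tx_buffer->next_to_watch;

    if (!eop_desc)
            return;                          /* nothing (more) to clean */

    /* ... unmap the header and any paged fragments up to eop_desc ... */

    tx_buffer->next_to_watch = NULL;         /* packet fully retired */
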
Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 1 - drivers/net/ethernet/intel/igb/igb_ethtool.c | 11 +- drivers/net/ethernet/intel/igb/igb_main.c | 120 +++++++++++-------- 3 files changed, 83 insertions(+), 49 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 87c9fe9d6f18..a638254f4e06 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -594,7 +594,6 @@ void igb_configure_rx_ring(struct igb_adapter *, struct igb_ring *); void igb_setup_tctl(struct igb_adapter *); void igb_setup_rctl(struct igb_adapter *); netdev_tx_t igb_xmit_frame_ring(struct sk_buff *, struct igb_ring *); -void igb_unmap_and_free_tx_resource(struct igb_ring *, struct igb_tx_buffer *); void igb_alloc_rx_buffers(struct igb_ring *, u16); void igb_update_stats(struct igb_adapter *, struct rtnl_link_stats64 *); bool igb_has_link(struct igb_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 3f5f7744c90f..612cf13b7a3a 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -1833,7 +1833,16 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring, /* unmap buffer on Tx side */ tx_buffer_info = &tx_ring->tx_buffer_info[tx_ntc]; - igb_unmap_and_free_tx_resource(tx_ring, tx_buffer_info); + + /* Free all the Tx ring sk_buffs */ + dev_kfree_skb_any(tx_buffer_info->skb); + + /* unmap skb header data */ + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buffer_info, dma), + dma_unmap_len(tx_buffer_info, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buffer_info, len, 0); /* increment Rx/Tx next to clean counters */ rx_ntc++; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 680f0d3d9b72..e541cccaae1b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3293,7 +3293,7 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring) size = sizeof(struct igb_tx_buffer) * tx_ring->count; - tx_ring->tx_buffer_info = vzalloc(size); + tx_ring->tx_buffer_info = vmalloc(size); if (!tx_ring->tx_buffer_info) goto err; @@ -3404,6 +3404,10 @@ void igb_configure_tx_ring(struct igb_adapter *adapter, txdctl |= IGB_TX_HTHRESH << 8; txdctl |= IGB_TX_WTHRESH << 16; + /* reinitialize tx_buffer_info */ + memset(ring->tx_buffer_info, 0, + sizeof(struct igb_tx_buffer) * ring->count); + txdctl |= E1000_TXDCTL_QUEUE_ENABLE; wr32(E1000_TXDCTL(reg_idx), txdctl); } @@ -3831,55 +3835,63 @@ static void igb_free_all_tx_resources(struct igb_adapter *adapter) igb_free_tx_resources(adapter->tx_ring[i]); } -void igb_unmap_and_free_tx_resource(struct igb_ring *ring, - struct igb_tx_buffer *tx_buffer) -{ - if (tx_buffer->skb) { - dev_kfree_skb_any(tx_buffer->skb); - if (dma_unmap_len(tx_buffer, len)) - dma_unmap_single(ring->dev, - dma_unmap_addr(tx_buffer, dma), - dma_unmap_len(tx_buffer, len), - DMA_TO_DEVICE); - } else if (dma_unmap_len(tx_buffer, len)) { - dma_unmap_page(ring->dev, - dma_unmap_addr(tx_buffer, dma), - dma_unmap_len(tx_buffer, len), - DMA_TO_DEVICE); - } - tx_buffer->next_to_watch = NULL; - tx_buffer->skb = NULL; - dma_unmap_len_set(tx_buffer, len, 0); - /* buffer_info must be completely set up in the transmit path */ -} - /** * igb_clean_tx_ring - Free Tx Buffers * @tx_ring: ring to be cleaned **/ static void igb_clean_tx_ring(struct 
igb_ring *tx_ring) { - struct igb_tx_buffer *buffer_info; - unsigned long size; - u16 i; + u16 i = tx_ring->next_to_clean; + struct igb_tx_buffer *tx_buffer = &tx_ring->tx_buffer_info[i]; - if (!tx_ring->tx_buffer_info) - return; - /* Free all the Tx ring sk_buffs */ + while (i != tx_ring->next_to_use) { + union e1000_adv_tx_desc *eop_desc, *tx_desc; - for (i = 0; i < tx_ring->count; i++) { - buffer_info = &tx_ring->tx_buffer_info[i]; - igb_unmap_and_free_tx_resource(tx_ring, buffer_info); + /* Free all the Tx ring sk_buffs */ + dev_kfree_skb_any(tx_buffer->skb); + + /* unmap skb header data */ + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + + /* check for eop_desc to determine the end of the packet */ + eop_desc = tx_buffer->next_to_watch; + tx_desc = IGB_TX_DESC(tx_ring, i); + + /* unmap remaining buffers */ + while (tx_desc != eop_desc) { + tx_buffer++; + tx_desc++; + i++; + if (unlikely(i == tx_ring->count)) { + i = 0; + tx_buffer = tx_ring->tx_buffer_info; + tx_desc = IGB_TX_DESC(tx_ring, 0); + } + + /* unmap any remaining paged data */ + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_page(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + } + + /* move us one more past the eop_desc for start of next pkt */ + tx_buffer++; + i++; + if (unlikely(i == tx_ring->count)) { + i = 0; + tx_buffer = tx_ring->tx_buffer_info; + } } + /* reset BQL for queue */ netdev_tx_reset_queue(txring_txq(tx_ring)); - size = sizeof(struct igb_tx_buffer) * tx_ring->count; - memset(tx_ring->tx_buffer_info, 0, size); - - /* Zero out the descriptor ring */ - memset(tx_ring->desc, 0, tx_ring->size); - + /* reset next_to_use and next_to_clean */ tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; } @@ -5254,18 +5266,32 @@ static void igb_tx_map(struct igb_ring *tx_ring, dma_error: dev_err(tx_ring->dev, "TX DMA map failed\n"); + tx_buffer = &tx_ring->tx_buffer_info[i]; /* clear dma mappings for failed tx_buffer_info map */ - for (;;) { + while (tx_buffer != first) { + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_page(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buffer, len, 0); + + if (i--) + i += tx_ring->count; tx_buffer = &tx_ring->tx_buffer_info[i]; - igb_unmap_and_free_tx_resource(tx_ring, tx_buffer); - if (tx_buffer == first) - break; - if (i == 0) - i = tx_ring->count; - i--; } + if (dma_unmap_len(tx_buffer, len)) + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buffer, len, 0); + + dev_kfree_skb_any(tx_buffer->skb); + tx_buffer->skb = NULL; + tx_ring->next_to_use = i; } @@ -5337,7 +5363,8 @@ netdev_tx_t igb_xmit_frame_ring(struct sk_buff *skb, return NETDEV_TX_OK; out_drop: - igb_unmap_and_free_tx_resource(tx_ring, first); + dev_kfree_skb_any(first->skb); + first->skb = NULL; return NETDEV_TX_OK; } @@ -6684,7 +6711,6 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector, int napi_budget) DMA_TO_DEVICE); /* clear tx_buffer data */ - tx_buffer->skb = NULL; dma_unmap_len_set(tx_buffer, len, 0); /* clear last DMA location and unmap remaining buffers */ From cfbc871c2174f352542053d25659920d6841ed41 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:26:15 -0800 Subject: [PATCH 05/13] igb: Limit maximum frame Rx based on MTU In order to support the use of build_skb going forward it will be 
necessary to place a maximum limit on the amount of data we can receive when jumbo frames is not enabled. In order to do this I am adding a new upper limit for receive based on the size of a 2K buffer minus padding. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 10 +++++++++- drivers/net/ethernet/intel/igb/igb_main.c | 21 +++++++++++++++++---- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index a638254f4e06..a74928cc0e58 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -143,8 +143,17 @@ struct vf_data_storage { #define IGB_RXBUFFER_256 256 #define IGB_RXBUFFER_2048 2048 #define IGB_RX_HDR_LEN IGB_RXBUFFER_256 +#define IGB_TS_HDR_LEN 16 #define IGB_RX_BUFSZ IGB_RXBUFFER_2048 +#define IGB_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) +#if (PAGE_SIZE < 8192) +#define IGB_MAX_FRAME_BUILD_SKB \ + (SKB_WITH_OVERHEAD(IGB_RXBUFFER_2048) - IGB_SKB_PAD - IGB_TS_HDR_LEN) +#else +#define IGB_MAX_FRAME_BUILD_SKB (IGB_RXBUFFER_2048 - IGB_TS_HDR_LEN) +#endif + /* How many Rx Buffers do we bundle into one write to the hardware ? */ #define IGB_RX_BUFFER_WRITE 16 /* Must be power of 2 */ @@ -561,7 +570,6 @@ struct igb_adapter { #define IGB_DMCTLX_DCFLUSH_DIS 0x80000000 /* Disable DMA Coal Flush */ #define IGB_82576_TSYNC_SHIFT 19 -#define IGB_TS_HDR_LEN 16 enum e1000_state_t { __IGB_TESTING, __IGB_RESETTING, diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index e541cccaae1b..1bf31224dfac 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -4250,7 +4250,7 @@ static void igb_set_rx_mode(struct net_device *netdev) struct igb_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; unsigned int vfn = adapter->vfs_allocated_count; - u32 rctl = 0, vmolr = 0; + u32 rctl = 0, vmolr = 0, rlpml = MAX_JUMBO_FRAME_SIZE; int count; /* Check for Promiscuous and All Multicast modes */ @@ -4308,6 +4308,14 @@ static void igb_set_rx_mode(struct net_device *netdev) E1000_RCTL_VFE); wr32(E1000_RCTL, rctl); +#if (PAGE_SIZE < 8192) + if (!adapter->vfs_allocated_count) { + if (adapter->max_frame_size <= IGB_MAX_FRAME_BUILD_SKB) + rlpml = IGB_MAX_FRAME_BUILD_SKB; + } +#endif + wr32(E1000_RLPML, rlpml); + /* In order to support SR-IOV and eventually VMDq it is necessary to set * the VMOLR to enable the appropriate modes. 
Without this workaround * we will have issues with VLAN tag stripping not being done for frames @@ -4322,12 +4330,17 @@ static void igb_set_rx_mode(struct net_device *netdev) vmolr |= rd32(E1000_VMOLR(vfn)) & ~(E1000_VMOLR_ROPE | E1000_VMOLR_MPME | E1000_VMOLR_ROMPE); - /* enable Rx jumbo frames, no need for restriction */ + /* enable Rx jumbo frames, restrict as needed to support build_skb */ vmolr &= ~E1000_VMOLR_RLPML_MASK; - vmolr |= MAX_JUMBO_FRAME_SIZE | E1000_VMOLR_LPE; +#if (PAGE_SIZE < 8192) + if (adapter->max_frame_size <= IGB_MAX_FRAME_BUILD_SKB) + vmolr |= IGB_MAX_FRAME_BUILD_SKB; + else +#endif + vmolr |= MAX_JUMBO_FRAME_SIZE; + vmolr |= E1000_VMOLR_LPE; wr32(E1000_VMOLR(vfn), vmolr); - wr32(E1000_RLPML, MAX_JUMBO_FRAME_SIZE); igb_restore_vf_multicasts(adapter); } From cb0ef1d1dc8739e307f70493b7eb1184685dbbde Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:26:26 -0800 Subject: [PATCH 06/13] igb: Only sync size of expected frame in ethtool testing We only need to sync the size of the frame that is read to test. We don't need to sync the entire Rx buffer. This way the testing is more consistent with how we handle things in the receive path. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index 612cf13b7a3a..d5966feb7b96 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -1818,7 +1818,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring, /* sync Rx buffer for CPU read */ dma_sync_single_for_cpu(rx_ring->dev, rx_buffer_info->dma, - IGB_RX_BUFSZ, + size, DMA_FROM_DEVICE); /* verify contents of skb */ @@ -1828,7 +1828,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring, /* sync Rx buffer for device write */ dma_sync_single_for_device(rx_ring->dev, rx_buffer_info->dma, - IGB_RX_BUFSZ, + size, DMA_FROM_DEVICE); /* unmap buffer on Tx side */ From 3456fd53421e7f499395542d6c2e76e0b46ab4d3 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:26:40 -0800 Subject: [PATCH 07/13] igb: Use page_address offset from page instead of masking virtual address Update the handling of page addresses so that we always refer to them using a void pointer, and try to use the consistent name of va indicating we are working with a virtual address. 
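For illustration only, the two ways of recovering a buffer's offset inside its page; deriving it from page_address() keeps working once later patches move to order-1 (multi-page) buffers, where masking with ~PAGE_MASK would fold the offset back into the first 4K:

    struct page *page = rx_buffer->page;
    void *va = page_address(page) + rx_buffer->page_offset;

    /* old: mask the virtual address down to an in-page offset */
    unsigned long off_old = (unsigned long)va & ~PAGE_MASK;

    /* new: express the offset relative to the page's base address */
    unsigned long off_new = va - page_address(page);
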
Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 11 +++++------ drivers/net/ethernet/intel/igb/igb_ptp.c | 3 +-- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index a74928cc0e58..6a88a08c021c 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -614,7 +614,7 @@ void igb_ptp_reset(struct igb_adapter *adapter); void igb_ptp_suspend(struct igb_adapter *adapter); void igb_ptp_rx_hang(struct igb_adapter *adapter); void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb); -void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, unsigned char *va, +void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, struct sk_buff *skb); int igb_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr); int igb_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr); diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 1bf31224dfac..44c590c5d5b7 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6927,7 +6927,7 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, struct sk_buff *skb) { struct page *page = rx_buffer->page; - unsigned char *va = page_address(page) + rx_buffer->page_offset; + void *va = page_address(page) + rx_buffer->page_offset; #if (PAGE_SIZE < 8192) unsigned int truesize = IGB_RX_BUFSZ; #else @@ -6969,7 +6969,7 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, add_tail_frag: skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, - (unsigned long)va & ~PAGE_MASK, size, truesize); + va - page_address(page), size, truesize); return igb_can_reuse_rx_page(rx_buffer, page, truesize); } @@ -6994,13 +6994,12 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, DMA_FROM_DEVICE); if (likely(!skb)) { - void *page_addr = page_address(page) + - rx_buffer->page_offset; + void *va = page_address(page) + rx_buffer->page_offset; /* prefetch first cache line of first page */ - prefetch(page_addr); + prefetch(va); #if L1_CACHE_BYTES < 128 - prefetch(page_addr + L1_CACHE_BYTES); + prefetch(va + L1_CACHE_BYTES); #endif /* allocate a skb to store the frags */ diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c index c4477552ce9e..7a3fd4d74592 100644 --- a/drivers/net/ethernet/intel/igb/igb_ptp.c +++ b/drivers/net/ethernet/intel/igb/igb_ptp.c @@ -764,8 +764,7 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter) * incoming frame. The value is stored in little endian format starting on * byte 8. **/ -void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, - unsigned char *va, +void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va, struct sk_buff *skb) { __le64 *regval = (__le64 *)va; From e08912985b296b33b18a563cc126e3e2f018c2e1 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:26:52 -0800 Subject: [PATCH 08/13] igb: Add support for ethtool private flag to allow use of legacy Rx Since there are potential drawbacks to the new Rx allocation approach I thought it best to add a "chicken bit" so that we can turn the feature off if in the event that a problem is found. It also provides a means of validating the legacy Rx path in the event that we are forced to fall back. 
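(For illustration: once applied, the flag is visible through the standard ethtool private-flag interface, e.g. "ethtool --show-priv-flags eth0" and "ethtool --set-priv-flags eth0 legacy-rx on", where eth0 is only an example name; flipping it while the interface is up triggers the igb_reinit_locked() reset added below so the queues are repopulated.)
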
At some point in the future when we are convinced we don't need it anymore we might be able to drop the legacy-rx flag. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 1 + drivers/net/ethernet/intel/igb/igb_ethtool.c | 48 ++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 6a88a08c021c..bffdfe65a0b6 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -557,6 +557,7 @@ struct igb_adapter { #define IGB_FLAG_HAS_MSIX BIT(13) #define IGB_FLAG_EEE BIT(14) #define IGB_FLAG_VLAN_PROMISC BIT(15) +#define IGB_FLAG_RX_LEGACY BIT(16) /* Media Auto Sense */ #define IGB_MAS_ENABLE_0 0X0001 diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index d5966feb7b96..797b9daba224 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -144,6 +144,13 @@ static const char igb_gstrings_test[][ETH_GSTRING_LEN] = { }; #define IGB_TEST_LEN (sizeof(igb_gstrings_test) / ETH_GSTRING_LEN) +static const char igb_priv_flags_strings[][ETH_GSTRING_LEN] = { +#define IGB_PRIV_FLAGS_LEGACY_RX BIT(0) + "legacy-rx", +}; + +#define IGB_PRIV_FLAGS_STR_LEN ARRAY_SIZE(igb_priv_flags_strings) + static int igb_get_settings(struct net_device *netdev, struct ethtool_cmd *ecmd) { struct igb_adapter *adapter = netdev_priv(netdev); @@ -852,6 +859,8 @@ static void igb_get_drvinfo(struct net_device *netdev, sizeof(drvinfo->fw_version)); strlcpy(drvinfo->bus_info, pci_name(adapter->pdev), sizeof(drvinfo->bus_info)); + + drvinfo->n_priv_flags = IGB_PRIV_FLAGS_STR_LEN; } static void igb_get_ringparam(struct net_device *netdev, @@ -2280,6 +2289,8 @@ static int igb_get_sset_count(struct net_device *netdev, int sset) return IGB_STATS_LEN; case ETH_SS_TEST: return IGB_TEST_LEN; + case ETH_SS_PRIV_FLAGS: + return IGB_PRIV_FLAGS_STR_LEN; default: return -ENOTSUPP; } @@ -2385,6 +2396,10 @@ static void igb_get_strings(struct net_device *netdev, u32 stringset, u8 *data) } /* BUG_ON(p - data != IGB_STATS_LEN * ETH_GSTRING_LEN); */ break; + case ETH_SS_PRIV_FLAGS: + memcpy(data, igb_priv_flags_strings, + IGB_PRIV_FLAGS_STR_LEN * ETH_GSTRING_LEN); + break; } } @@ -3397,6 +3412,37 @@ static int igb_set_channels(struct net_device *netdev, return 0; } +static u32 igb_get_priv_flags(struct net_device *netdev) +{ + struct igb_adapter *adapter = netdev_priv(netdev); + u32 priv_flags = 0; + + if (adapter->flags & IGB_FLAG_RX_LEGACY) + priv_flags |= IGB_PRIV_FLAGS_LEGACY_RX; + + return priv_flags; +} + +static int igb_set_priv_flags(struct net_device *netdev, u32 priv_flags) +{ + struct igb_adapter *adapter = netdev_priv(netdev); + unsigned int flags = adapter->flags; + + flags &= ~IGB_FLAG_RX_LEGACY; + if (priv_flags & IGB_PRIV_FLAGS_LEGACY_RX) + flags |= IGB_FLAG_RX_LEGACY; + + if (flags != adapter->flags) { + adapter->flags = flags; + + /* reset interface to repopulate queues */ + if (netif_running(netdev)) + igb_reinit_locked(adapter); + } + + return 0; +} + static const struct ethtool_ops igb_ethtool_ops = { .get_settings = igb_get_settings, .set_settings = igb_set_settings, @@ -3435,6 +3481,8 @@ static const struct ethtool_ops igb_ethtool_ops = { .set_rxfh = igb_set_rxfh, .get_channels = igb_get_channels, .set_channels = igb_set_channels, + .get_priv_flags = igb_get_priv_flags, + .set_priv_flags = igb_set_priv_flags, .begin = 
igb_ethtool_begin, .complete = igb_ethtool_complete, }; From 8649aaef4044681257ed38cf8706aea88430f2c4 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:27:03 -0800 Subject: [PATCH 09/13] igb: Add support for using order 1 pages to receive large frames This patch adds support for using 3K buffers in order 1 pages the same way we were using 2K buffers in 4K pages. We are reserving 1K of room for now to have space available for future headroom and tailroom when we enable build_skb support. One side effect of this patch is that we can end up using a larger buffer if jumbo frames is enabled. The impact shouldn't be too great, but it could hurt small packet performance for UDP workloads if jumbo frames is enabled as the truesize of frames will be larger. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 30 ++++++++++- drivers/net/ethernet/intel/igb/igb_main.c | 64 +++++++++++++++++------ 2 files changed, 76 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index bffdfe65a0b6..eb91c87e0c1d 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -142,9 +142,9 @@ struct vf_data_storage { /* Supported Rx Buffer Sizes */ #define IGB_RXBUFFER_256 256 #define IGB_RXBUFFER_2048 2048 +#define IGB_RXBUFFER_3072 3072 #define IGB_RX_HDR_LEN IGB_RXBUFFER_256 #define IGB_TS_HDR_LEN 16 -#define IGB_RX_BUFSZ IGB_RXBUFFER_2048 #define IGB_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) #if (PAGE_SIZE < 8192) @@ -313,12 +313,40 @@ struct igb_q_vector { }; enum e1000_ring_flags_t { + IGB_RING_FLAG_RX_3K_BUFFER, IGB_RING_FLAG_RX_SCTP_CSUM, IGB_RING_FLAG_RX_LB_VLAN_BSWAP, IGB_RING_FLAG_TX_CTX_IDX, IGB_RING_FLAG_TX_DETECT_HANG }; +#define ring_uses_large_buffer(ring) \ + test_bit(IGB_RING_FLAG_RX_3K_BUFFER, &(ring)->flags) +#define set_ring_uses_large_buffer(ring) \ + set_bit(IGB_RING_FLAG_RX_3K_BUFFER, &(ring)->flags) +#define clear_ring_uses_large_buffer(ring) \ + clear_bit(IGB_RING_FLAG_RX_3K_BUFFER, &(ring)->flags) + +static inline unsigned int igb_rx_bufsz(struct igb_ring *ring) +{ +#if (PAGE_SIZE < 8192) + if (ring_uses_large_buffer(ring)) + return IGB_RXBUFFER_3072; +#endif + return IGB_RXBUFFER_2048; +} + +static inline unsigned int igb_rx_pg_order(struct igb_ring *ring) +{ +#if (PAGE_SIZE < 8192) + if (ring_uses_large_buffer(ring)) + return 1; +#endif + return 0; +} + +#define igb_rx_pg_size(_ring) (PAGE_SIZE << igb_rx_pg_order(_ring)) + #define IGB_TXD_DCMD (E1000_ADVTXD_DCMD_EOP | E1000_ADVTXD_DCMD_RS) #define IGB_RX_DESC(R, i) \ diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 44c590c5d5b7..24c20d401240 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -554,7 +554,7 @@ static void igb_dump(struct igb_adapter *adapter) 16, 1, page_address(buffer_info->page) + buffer_info->page_offset, - IGB_RX_BUFSZ, true); + igb_rx_bufsz(rx_ring), true); } } } @@ -3746,7 +3746,10 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, /* set descriptor configuration */ srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT; - srrctl |= IGB_RX_BUFSZ >> E1000_SRRCTL_BSIZEPKT_SHIFT; + if (ring_uses_large_buffer(ring)) + srrctl |= IGB_RXBUFFER_3072 >> E1000_SRRCTL_BSIZEPKT_SHIFT; + else + srrctl |= IGB_RXBUFFER_2048 >> E1000_SRRCTL_BSIZEPKT_SHIFT; srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF; if (hw->mac.type >= 
e1000_82580) srrctl |= E1000_SRRCTL_TIMESTAMP; @@ -3776,6 +3779,23 @@ void igb_configure_rx_ring(struct igb_adapter *adapter, wr32(E1000_RXDCTL(reg_idx), rxdctl); } +static void igb_set_rx_buffer_len(struct igb_adapter *adapter, + struct igb_ring *rx_ring) +{ + /* set build_skb and buffer size flags */ + clear_ring_uses_large_buffer(rx_ring); + + if (adapter->flags & IGB_FLAG_RX_LEGACY) + return; + +#if (PAGE_SIZE < 8192) + if (adapter->max_frame_size <= IGB_MAX_FRAME_BUILD_SKB) + return; + + set_ring_uses_large_buffer(rx_ring); +#endif +} + /** * igb_configure_rx - Configure receive Unit after Reset * @adapter: board private structure @@ -3793,8 +3813,12 @@ static void igb_configure_rx(struct igb_adapter *adapter) /* Setup the HW Rx Head and Tail Descriptor Pointers and * the Base and Length of the Rx Descriptor Ring */ - for (i = 0; i < adapter->num_rx_queues; i++) - igb_configure_rx_ring(adapter, adapter->rx_ring[i]); + for (i = 0; i < adapter->num_rx_queues; i++) { + struct igb_ring *rx_ring = adapter->rx_ring[i]; + + igb_set_rx_buffer_len(adapter, rx_ring); + igb_configure_rx_ring(adapter, rx_ring); + } } /** @@ -3969,13 +3993,13 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) dma_sync_single_range_for_cpu(rx_ring->dev, buffer_info->dma, buffer_info->page_offset, - IGB_RX_BUFSZ, + igb_rx_bufsz(rx_ring), DMA_FROM_DEVICE); /* free resources associated with mapping */ dma_unmap_page_attrs(rx_ring->dev, buffer_info->dma, - PAGE_SIZE, + igb_rx_pg_size(rx_ring), DMA_FROM_DEVICE, IGB_RX_DMA_ATTR); __page_frag_cache_drain(buffer_info->page, @@ -6870,7 +6894,7 @@ static inline bool igb_page_is_reserved(struct page *page) static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, struct page *page, - unsigned int truesize) + const unsigned int truesize) { unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; @@ -6884,12 +6908,14 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, return false; /* flip page offset to other buffer */ - rx_buffer->page_offset ^= IGB_RX_BUFSZ; + rx_buffer->page_offset ^= truesize; #else /* move offset up to the next cache line */ rx_buffer->page_offset += truesize; +#define IGB_LAST_OFFSET \ + (SKB_WITH_OVERHEAD(PAGE_SIZE) - IGB_RXBUFFER_2048) - if (rx_buffer->page_offset > (PAGE_SIZE - IGB_RX_BUFSZ)) + if (rx_buffer->page_offset > IGB_LAST_OFFSET) return false; #endif @@ -6929,7 +6955,7 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, struct page *page = rx_buffer->page; void *va = page_address(page) + rx_buffer->page_offset; #if (PAGE_SIZE < 8192) - unsigned int truesize = IGB_RX_BUFSZ; + unsigned int truesize = igb_rx_pg_size(rx_ring) / 2; #else unsigned int truesize = SKB_DATA_ALIGN(size); #endif @@ -7025,7 +7051,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, * any references we are holding to it */ dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, - PAGE_SIZE, DMA_FROM_DEVICE, + igb_rx_pg_size(rx_ring), DMA_FROM_DEVICE, IGB_RX_DMA_ATTR); __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } @@ -7278,21 +7304,23 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, return true; /* alloc new page for storage */ - page = dev_alloc_page(); + page = dev_alloc_pages(igb_rx_pg_order(rx_ring)); if (unlikely(!page)) { rx_ring->rx_stats.alloc_failed++; return false; } /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE, - DMA_FROM_DEVICE, IGB_RX_DMA_ATTR); + dma = dma_map_page_attrs(rx_ring->dev, page, 0, + igb_rx_pg_size(rx_ring), + DMA_FROM_DEVICE, + 
IGB_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ if (dma_mapping_error(rx_ring->dev, dma)) { - __free_page(page); + __free_pages(page, igb_rx_pg_order(rx_ring)); rx_ring->rx_stats.alloc_failed++; return false; @@ -7315,6 +7343,7 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) union e1000_adv_rx_desc *rx_desc; struct igb_rx_buffer *bi; u16 i = rx_ring->next_to_use; + u16 bufsz; /* nothing to do */ if (!cleaned_count) @@ -7324,14 +7353,15 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count) bi = &rx_ring->rx_buffer_info[i]; i -= rx_ring->count; + bufsz = igb_rx_bufsz(rx_ring); + do { if (!igb_alloc_mapped_page(rx_ring, bi)) break; /* sync the buffer for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, - IGB_RX_BUFSZ, + bi->page_offset, bufsz, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change From e3cdf68d4a861d91ef62ed615483e673f07fccfe Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:27:14 -0800 Subject: [PATCH 10/13] igb: Add support for padding packet With the size of the frame limited we can now write to an offset within the buffer instead of having to write at the very start of the buffer. The advantage to this is that it allows us to leave padding room for things like supporting XDP in the future. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 11 +++++++++++ drivers/net/ethernet/intel/igb/igb_main.c | 14 ++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index eb91c87e0c1d..dc6e2980718f 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -314,6 +314,7 @@ struct igb_q_vector { enum e1000_ring_flags_t { IGB_RING_FLAG_RX_3K_BUFFER, + IGB_RING_FLAG_RX_BUILD_SKB_ENABLED, IGB_RING_FLAG_RX_SCTP_CSUM, IGB_RING_FLAG_RX_LB_VLAN_BSWAP, IGB_RING_FLAG_TX_CTX_IDX, @@ -327,11 +328,21 @@ enum e1000_ring_flags_t { #define clear_ring_uses_large_buffer(ring) \ clear_bit(IGB_RING_FLAG_RX_3K_BUFFER, &(ring)->flags) +#define ring_uses_build_skb(ring) \ + test_bit(IGB_RING_FLAG_RX_BUILD_SKB_ENABLED, &(ring)->flags) +#define set_ring_build_skb_enabled(ring) \ + set_bit(IGB_RING_FLAG_RX_BUILD_SKB_ENABLED, &(ring)->flags) +#define clear_ring_build_skb_enabled(ring) \ + clear_bit(IGB_RING_FLAG_RX_BUILD_SKB_ENABLED, &(ring)->flags) + static inline unsigned int igb_rx_bufsz(struct igb_ring *ring) { #if (PAGE_SIZE < 8192) if (ring_uses_large_buffer(ring)) return IGB_RXBUFFER_3072; + + if (ring_uses_build_skb(ring)) + return IGB_MAX_FRAME_BUILD_SKB + IGB_TS_HDR_LEN; #endif return IGB_RXBUFFER_2048; } diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 24c20d401240..3ef66577872b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3783,11 +3783,14 @@ static void igb_set_rx_buffer_len(struct igb_adapter *adapter, struct igb_ring *rx_ring) { /* set build_skb and buffer size flags */ + clear_ring_build_skb_enabled(rx_ring); clear_ring_uses_large_buffer(rx_ring); if (adapter->flags & IGB_FLAG_RX_LEGACY) return; + set_ring_build_skb_enabled(rx_ring); + #if (PAGE_SIZE < 8192) if (adapter->max_frame_size <= IGB_MAX_FRAME_BUILD_SKB) return; @@ -6957,7 +6960,9 @@ static bool 
igb_add_rx_frag(struct igb_ring *rx_ring, #if (PAGE_SIZE < 8192) unsigned int truesize = igb_rx_pg_size(rx_ring) / 2; #else - unsigned int truesize = SKB_DATA_ALIGN(size); + unsigned int truesize = ring_uses_build_skb(rx_ring) ? + SKB_DATA_ALIGN(IGB_SKB_PAD + size) : + SKB_DATA_ALIGN(size); #endif unsigned int pull_len; @@ -7293,6 +7298,11 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) return total_packets; } +static inline unsigned int igb_rx_offset(struct igb_ring *rx_ring) +{ + return ring_uses_build_skb(rx_ring) ? IGB_SKB_PAD : 0; +} + static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, struct igb_rx_buffer *bi) { @@ -7328,7 +7338,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, bi->dma = dma; bi->page = page; - bi->page_offset = 0; + bi->page_offset = igb_rx_offset(rx_ring); bi->pagecnt_bias = 1; return true; From e014272672b964471608a2624e4cdf1d5e7c22ea Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:27:26 -0800 Subject: [PATCH 11/13] igb: Break out Rx buffer page management At this point we have 2 to 3 paths that can be taken depending on what Rx modes are enabled. In order to better support that and improve the maintainability I am breaking out the common bits from those paths and making them into their own functions. Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 235 +++++++++++----------- 1 file changed, 121 insertions(+), 114 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 3ef66577872b..dfae641647d3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6886,8 +6886,14 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring, nta++; rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - /* transfer page from old buffer to new buffer */ - *new_buff = *old_buff; + /* Transfer page from old buffer to new buffer. + * Move each member individually to avoid possible store + * forwarding stalls. 
+ */ + new_buff->dma = old_buff->dma; + new_buff->page = old_buff->page; + new_buff->page_offset = old_buff->page_offset; + new_buff->pagecnt_bias = old_buff->pagecnt_bias; } static inline bool igb_page_is_reserved(struct page *page) @@ -6895,11 +6901,10 @@ static inline bool igb_page_is_reserved(struct page *page) return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } -static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, - struct page *page, - const unsigned int truesize) +static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer) { - unsigned int pagecnt_bias = rx_buffer->pagecnt_bias--; + unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; + struct page *page = rx_buffer->page; /* avoid re-using remote pages */ if (unlikely(igb_page_is_reserved(page))) @@ -6907,14 +6912,9 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, #if (PAGE_SIZE < 8192) /* if we are only owner of page we can reuse it */ - if (unlikely(page_ref_count(page) != pagecnt_bias)) + if (unlikely((page_ref_count(page) - pagecnt_bias) > 1)) return false; - - /* flip page offset to other buffer */ - rx_buffer->page_offset ^= truesize; #else - /* move offset up to the next cache line */ - rx_buffer->page_offset += truesize; #define IGB_LAST_OFFSET \ (SKB_WITH_OVERHEAD(PAGE_SIZE) - IGB_RXBUFFER_2048) @@ -6926,7 +6926,7 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, * the pagecnt_bias and page count so that we fully restock the * number of references the driver holds. */ - if (unlikely(pagecnt_bias == 1)) { + if (unlikely(!pagecnt_bias)) { page_ref_add(page, USHRT_MAX); rx_buffer->pagecnt_bias = USHRT_MAX; } @@ -6938,25 +6938,16 @@ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, * igb_add_rx_frag - Add contents of Rx buffer to sk_buff * @rx_ring: rx descriptor ring to transact packets on * @rx_buffer: buffer containing page to add - * @rx_desc: descriptor containing length of buffer written by hardware * @skb: sk_buff to place the data into + * @size: size of buffer to be added * * This function will add the data contained in rx_buffer->page to the skb. - * This is done either through a direct copy if the data in the buffer is - * less than the skb header size, otherwise it will just attach the page as - * a frag to the skb. - * - * The function will then update the page offset if necessary and return - * true if the buffer can be reused by the adapter. 
**/ -static bool igb_add_rx_frag(struct igb_ring *rx_ring, +static void igb_add_rx_frag(struct igb_ring *rx_ring, struct igb_rx_buffer *rx_buffer, - unsigned int size, - union e1000_adv_rx_desc *rx_desc, - struct sk_buff *skb) + struct sk_buff *skb, + unsigned int size) { - struct page *page = rx_buffer->page; - void *va = page_address(page) + rx_buffer->page_offset; #if (PAGE_SIZE < 8192) unsigned int truesize = igb_rx_pg_size(rx_ring) / 2; #else @@ -6964,10 +6955,39 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, SKB_DATA_ALIGN(IGB_SKB_PAD + size) : SKB_DATA_ALIGN(size); #endif - unsigned int pull_len; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, + rx_buffer->page_offset, size, truesize); +#if (PAGE_SIZE < 8192) + rx_buffer->page_offset ^= truesize; +#else + rx_buffer->page_offset += truesize; +#endif +} - if (unlikely(skb_is_nonlinear(skb))) - goto add_tail_frag; +static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring, + struct igb_rx_buffer *rx_buffer, + union e1000_adv_rx_desc *rx_desc, + unsigned int size) +{ + void *va = page_address(rx_buffer->page) + rx_buffer->page_offset; +#if (PAGE_SIZE < 8192) + unsigned int truesize = igb_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(size); +#endif + unsigned int headlen; + struct sk_buff *skb; + + /* prefetch first cache line of first page */ + prefetch(va); +#if L1_CACHE_BYTES < 128 + prefetch(va + L1_CACHE_BYTES); +#endif + + /* allocate a skb to store the frags */ + skb = napi_alloc_skb(&rx_ring->q_vector->napi, IGB_RX_HDR_LEN); + if (unlikely(!skb)) + return NULL; if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) { igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb); @@ -6975,95 +6995,29 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, size -= IGB_TS_HDR_LEN; } - if (likely(size <= IGB_RX_HDR_LEN)) { - memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long))); - - /* page is not reserved, we can reuse buffer as-is */ - if (likely(!igb_page_is_reserved(page))) - return true; - - /* this page cannot be reused so discard it */ - return false; - } - - /* we need the header to contain the greater of either ETH_HLEN or - * 60 bytes if the skb->len is less than 60 for skb_pad. 
- */ - pull_len = eth_get_headlen(va, IGB_RX_HDR_LEN); + /* Determine available headroom for copy */ + headlen = size; + if (headlen > IGB_RX_HDR_LEN) + headlen = eth_get_headlen(va, IGB_RX_HDR_LEN); /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long))); + memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); /* update all of the pointers */ - va += pull_len; - size -= pull_len; - -add_tail_frag: - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, - va - page_address(page), size, truesize); - - return igb_can_reuse_rx_page(rx_buffer, page, truesize); -} - -static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, - union e1000_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - unsigned int size = le16_to_cpu(rx_desc->wb.upper.length); - struct igb_rx_buffer *rx_buffer; - struct page *page; - - rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; - page = rx_buffer->page; - prefetchw(page); - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - size, - DMA_FROM_DEVICE); - - if (likely(!skb)) { - void *va = page_address(page) + rx_buffer->page_offset; - - /* prefetch first cache line of first page */ - prefetch(va); -#if L1_CACHE_BYTES < 128 - prefetch(va + L1_CACHE_BYTES); + size -= headlen; + if (size) { + skb_add_rx_frag(skb, 0, rx_buffer->page, + (va + headlen) - page_address(rx_buffer->page), + size, truesize); +#if (PAGE_SIZE < 8192) + rx_buffer->page_offset ^= truesize; +#else + rx_buffer->page_offset += truesize; #endif - - /* allocate a skb to store the frags */ - skb = napi_alloc_skb(&rx_ring->q_vector->napi, IGB_RX_HDR_LEN); - if (unlikely(!skb)) { - rx_ring->rx_stats.alloc_failed++; - return NULL; - } - - /* we will be copying header into skb->data in - * pskb_may_pull so it is in our interest to prefetch - * it now to avoid a possible cache miss - */ - prefetchw(skb->data); - } - - /* pull page into skb */ - if (igb_add_rx_frag(rx_ring, rx_buffer, size, rx_desc, skb)) { - /* hand second half of page back to the ring */ - igb_reuse_rx_page(rx_ring, rx_buffer); } else { - /* We are not reusing the buffer so unmap it and free - * any references we are holding to it - */ - dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, - igb_rx_pg_size(rx_ring), DMA_FROM_DEVICE, - IGB_RX_DMA_ATTR); - __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); + rx_buffer->pagecnt_bias++; } - /* clear contents of rx_buffer */ - rx_buffer->page = NULL; - return skb; } @@ -7221,6 +7175,47 @@ static void igb_process_skb_fields(struct igb_ring *rx_ring, skb->protocol = eth_type_trans(skb, rx_ring->netdev); } +static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring, + const unsigned int size) +{ + struct igb_rx_buffer *rx_buffer; + + rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean]; + prefetchw(rx_buffer->page); + + /* we are reusing so sync this buffer for CPU use */ + dma_sync_single_range_for_cpu(rx_ring->dev, + rx_buffer->dma, + rx_buffer->page_offset, + size, + DMA_FROM_DEVICE); + + rx_buffer->pagecnt_bias--; + + return rx_buffer; +} + +static void igb_put_rx_buffer(struct igb_ring *rx_ring, + struct igb_rx_buffer *rx_buffer) +{ + if (igb_can_reuse_rx_page(rx_buffer)) { + /* hand second half of page back to the ring */ + igb_reuse_rx_page(rx_ring, rx_buffer); + } else { + /* We are not reusing the buffer so unmap it and free + * any references we are holding 
to it + */ + dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, + igb_rx_pg_size(rx_ring), DMA_FROM_DEVICE, + IGB_RX_DMA_ATTR); + __page_frag_cache_drain(rx_buffer->page, + rx_buffer->pagecnt_bias); + } + + /* clear contents of rx_buffer */ + rx_buffer->page = NULL; +} + static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) { struct igb_ring *rx_ring = q_vector->rx.ring; @@ -7230,6 +7225,8 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) while (likely(total_packets < budget)) { union e1000_adv_rx_desc *rx_desc; + struct igb_rx_buffer *rx_buffer; + unsigned int size; /* return some buffers to hardware, one at a time is too slow */ if (cleaned_count >= IGB_RX_BUFFER_WRITE) { @@ -7238,8 +7235,8 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) } rx_desc = IGB_RX_DESC(rx_ring, rx_ring->next_to_clean); - - if (!rx_desc->wb.upper.length) + size = le16_to_cpu(rx_desc->wb.upper.length); + if (!size) break; /* This memory barrier is needed to keep us from reading @@ -7248,13 +7245,23 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) */ dma_rmb(); + rx_buffer = igb_get_rx_buffer(rx_ring, size); + /* retrieve a buffer from the ring */ - skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb); + if (skb) + igb_add_rx_frag(rx_ring, rx_buffer, skb, size); + else + skb = igb_construct_skb(rx_ring, rx_buffer, + rx_desc, size); /* exit if we failed to retrieve a buffer */ - if (!skb) + if (!skb) { + rx_ring->rx_stats.alloc_failed++; + rx_buffer->pagecnt_bias++; break; + } + igb_put_rx_buffer(rx_ring, rx_buffer); cleaned_count++; /* fetch next buffer in frame if non-eop */ From b1bb2eb0a0deb03e9847d8e29eb1e75ce141e4d9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 6 Feb 2017 18:27:36 -0800 Subject: [PATCH 12/13] igb: Re-add support for build_skb in igb This reverts commit f9d40f6a9921 ("igb: Revert support for build_skb in igb") and adds a few changes to update it to work with the latest version of igb. We are now able to revert the removal of this due to the fact that with the recent changes to the page count and the use of DMA_ATTR_SKIP_CPU_SYNC we can make the pages writable so we should not be invalidating the additional data added when we call build_skb. 
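For illustration only (a sketch of the pattern the restored path uses, with sizes assuming PAGE_SIZE < 8192; not a substitute for the hunk below): build_skb() wraps the skb around the existing half-page buffer instead of copying the payload, and the IGB_SKB_PAD headroom reserved by the earlier padding patch becomes the skb headroom:

    void *va = page_address(rx_buffer->page) + rx_buffer->page_offset;
    unsigned int truesize = igb_rx_pg_size(rx_ring) / 2;
    unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
    struct sk_buff *skb;

    skb = build_skb(va - IGB_SKB_PAD, truesize);     /* no payload memcpy */
    if (likely(skb)) {
            skb_reserve(skb, IGB_SKB_PAD);           /* headroom before data */
            __skb_put(skb, size);                    /* expose the payload */
    }
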
Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 47 +++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index dfae641647d3..79f39a785dca 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7021,6 +7021,51 @@ static struct sk_buff *igb_construct_skb(struct igb_ring *rx_ring, return skb; } +static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring, + struct igb_rx_buffer *rx_buffer, + union e1000_adv_rx_desc *rx_desc, + unsigned int size) +{ + void *va = page_address(rx_buffer->page) + rx_buffer->page_offset; +#if (PAGE_SIZE < 8192) + unsigned int truesize = igb_rx_pg_size(rx_ring) / 2; +#else + unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + + SKB_DATA_ALIGN(IGB_SKB_PAD + size); +#endif + struct sk_buff *skb; + + /* prefetch first cache line of first page */ + prefetch(va); +#if L1_CACHE_BYTES < 128 + prefetch(va + L1_CACHE_BYTES); +#endif + + /* build an skb to around the page buffer */ + skb = build_skb(va - IGB_SKB_PAD, truesize); + if (unlikely(!skb)) + return NULL; + + /* update pointers within the skb to store the data */ + skb_reserve(skb, IGB_SKB_PAD); + __skb_put(skb, size); + + /* pull timestamp out of packet data */ + if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { + igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb); + __skb_pull(skb, IGB_TS_HDR_LEN); + } + + /* update buffer offset */ +#if (PAGE_SIZE < 8192) + rx_buffer->page_offset ^= truesize; +#else + rx_buffer->page_offset += truesize; +#endif + + return skb; +} + static inline void igb_rx_checksum(struct igb_ring *ring, union e1000_adv_rx_desc *rx_desc, struct sk_buff *skb) @@ -7250,6 +7295,8 @@ static int igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget) /* retrieve a buffer from the ring */ if (skb) igb_add_rx_frag(rx_ring, rx_buffer, skb, size); + else if (ring_uses_build_skb(rx_ring)) + skb = igb_build_skb(rx_ring, rx_buffer, rx_desc, size); else skb = igb_construct_skb(rx_ring, rx_buffer, rx_desc, size); From 3a1eb6d10c9350fa7adce850a752693460ac62d6 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 15 Feb 2017 09:15:59 -0800 Subject: [PATCH 13/13] igb/ixgbe: Fix typo in igb_build_skb and/or ixgbe_build_skb code comment There was a typo that I had left in the code comments for the igb and ixgbe functions that enabled build_skb support. 
Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 79f39a785dca..26a821fcd220 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7041,7 +7041,7 @@ static struct sk_buff *igb_build_skb(struct igb_ring *rx_ring, prefetch(va + L1_CACHE_BYTES); #endif - /* build an skb to around the page buffer */ + /* build an skb around the page buffer */ skb = build_skb(va - IGB_SKB_PAD, truesize); if (unlikely(!skb)) return NULL; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index d45477db0227..852a2e7e25ed 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -2122,7 +2122,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring, prefetch(va + L1_CACHE_BYTES); #endif - /* build an skb to around the page buffer */ + /* build an skb around the page buffer */ skb = build_skb(va - IXGBE_SKB_PAD, truesize); if (unlikely(!skb)) return NULL;