Merge branch 'aquantia-rx-perf'

Igor Russkikh says:

====================
net: aquantia: RX performance optimization patches

Here is a set of patches targeting for performance improvement
on various platforms and protocols.

Our main target was rx performance on iommu systems, notably
NVIDIA Jetson TX2 and NVIDIA Xavier platforms.

We introduce page reuse strategy to better deal with iommu dma mapping costs.
With it we see 80-90% of page reuse under some test configurations on UDP traffic.

This shows good improvements on other systems with IOMMU hardware, like
AMD Ryzen.

We've also improved TCP LRO configuration parameters, allowing packets to better
coalesce.

Page reuse tests were carried out using iperf3, iperf2, netperf and pktgen.
Mainly on UDP traffic, with various packet lengths.

Jetson TX2, UDP, Default MTU:
RX Lost Datagrams
  Before: Max: 69%  Min: 68% Avg: 68.5%
  After:  Max: 41%  Min: 38% Avg: 39.2%
Maximum throughput
  Before: 1.27 Gbits/sec
  After:  2.41 Gbits/sec

AMD Ryzen 5 2400G, UDP, Default MTU:
RX Lost Datagrams
  Before:  Max: 12%  Min: 4.5% Avg: 7.17%
  After:   Max: 6.2% Min: 2.3% Avg: 4.26%
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2019-03-23 22:16:54 -04:00
commit 956ca8fc5c
13 changed files with 224 additions and 73 deletions

View file

@ -17,7 +17,8 @@ if NET_VENDOR_AQUANTIA
config AQTION
tristate "aQuantia AQtion(tm) Support"
depends on PCI && X86_64
depends on PCI
depends on X86_64 || ARM64 || COMPILE_TEST
---help---
This enables the support for the aQuantia AQtion(tm) Ethernet card.

View file

@ -16,7 +16,7 @@
#define AQ_CFG_TCS_DEF 1U
#define AQ_CFG_TXDS_DEF 4096U
#define AQ_CFG_RXDS_DEF 1024U
#define AQ_CFG_RXDS_DEF 2048U
#define AQ_CFG_IS_POLLING_DEF 0U
@ -34,10 +34,16 @@
#define AQ_CFG_TCS_MAX 8U
#define AQ_CFG_TX_FRAME_MAX (16U * 1024U)
#define AQ_CFG_RX_FRAME_MAX (4U * 1024U)
#define AQ_CFG_RX_FRAME_MAX (2U * 1024U)
#define AQ_CFG_TX_CLEAN_BUDGET 256U
#define AQ_CFG_RX_REFILL_THRES 32U
#define AQ_CFG_RX_HDR_SIZE 256U
#define AQ_CFG_RX_PAGEORDER 0U
/* LRO */
#define AQ_CFG_IS_LRO_DEF 1U

View file

@ -73,6 +73,7 @@ void aq_nic_cfg_start(struct aq_nic_s *self)
cfg->tx_itr = aq_itr_tx;
cfg->rx_itr = aq_itr_rx;
cfg->rxpageorder = AQ_CFG_RX_PAGEORDER;
cfg->is_rss = AQ_CFG_IS_RSS_DEF;
cfg->num_rss_queues = AQ_CFG_NUM_RSS_QUEUES_DEF;
cfg->aq_rss.base_cpu_number = AQ_CFG_RSS_BASE_CPU_NUM_DEF;

View file

@ -31,6 +31,7 @@ struct aq_nic_cfg_s {
u32 itr;
u16 rx_itr;
u16 tx_itr;
u32 rxpageorder;
u32 num_rss_queues;
u32 mtu;
u32 flow_control;

View file

@ -12,10 +12,89 @@
#include "aq_ring.h"
#include "aq_nic.h"
#include "aq_hw.h"
#include "aq_hw_utils.h"
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
static inline void aq_free_rxpage(struct aq_rxpage *rxpage, struct device *dev)
{
unsigned int len = PAGE_SIZE << rxpage->order;
dma_unmap_page(dev, rxpage->daddr, len, DMA_FROM_DEVICE);
/* Drop the ref for being in the ring. */
__free_pages(rxpage->page, rxpage->order);
rxpage->page = NULL;
}
static int aq_get_rxpage(struct aq_rxpage *rxpage, unsigned int order,
struct device *dev)
{
struct page *page;
dma_addr_t daddr;
int ret = -ENOMEM;
page = dev_alloc_pages(order);
if (unlikely(!page))
goto err_exit;
daddr = dma_map_page(dev, page, 0, PAGE_SIZE << order,
DMA_FROM_DEVICE);
if (unlikely(dma_mapping_error(dev, daddr)))
goto free_page;
rxpage->page = page;
rxpage->daddr = daddr;
rxpage->order = order;
rxpage->pg_off = 0;
return 0;
free_page:
__free_pages(page, order);
err_exit:
return ret;
}
static int aq_get_rxpages(struct aq_ring_s *self, struct aq_ring_buff_s *rxbuf,
int order)
{
int ret;
if (rxbuf->rxdata.page) {
/* One means ring is the only user and can reuse */
if (page_ref_count(rxbuf->rxdata.page) > 1) {
/* Try reuse buffer */
rxbuf->rxdata.pg_off += AQ_CFG_RX_FRAME_MAX;
if (rxbuf->rxdata.pg_off + AQ_CFG_RX_FRAME_MAX <=
(PAGE_SIZE << order)) {
self->stats.rx.pg_flips++;
} else {
/* Buffer exhausted. We have other users and
* should release this page and realloc
*/
aq_free_rxpage(&rxbuf->rxdata,
aq_nic_get_dev(self->aq_nic));
self->stats.rx.pg_losts++;
}
} else {
rxbuf->rxdata.pg_off = 0;
self->stats.rx.pg_reuses++;
}
}
if (!rxbuf->rxdata.page) {
ret = aq_get_rxpage(&rxbuf->rxdata, order,
aq_nic_get_dev(self->aq_nic));
return ret;
}
return 0;
}
static struct aq_ring_s *aq_ring_alloc(struct aq_ring_s *self,
struct aq_nic_s *aq_nic)
{
@ -81,6 +160,11 @@ struct aq_ring_s *aq_ring_rx_alloc(struct aq_ring_s *self,
self->idx = idx;
self->size = aq_nic_cfg->rxds;
self->dx_size = aq_nic_cfg->aq_hw_caps->rxd_size;
self->page_order = fls(AQ_CFG_RX_FRAME_MAX / PAGE_SIZE +
(AQ_CFG_RX_FRAME_MAX % PAGE_SIZE ? 1 : 0)) - 1;
if (aq_nic_cfg->rxpageorder > self->page_order)
self->page_order = aq_nic_cfg->rxpageorder;
self = aq_ring_alloc(self, aq_nic);
if (!self) {
@ -201,22 +285,21 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
int budget)
{
struct net_device *ndev = aq_nic_get_ndev(self->aq_nic);
int err = 0;
bool is_rsc_completed = true;
int err = 0;
for (; (self->sw_head != self->hw_head) && budget;
self->sw_head = aq_ring_next_dx(self, self->sw_head),
--budget, ++(*work_done)) {
struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head];
struct aq_ring_buff_s *buff_ = NULL;
struct sk_buff *skb = NULL;
unsigned int next_ = 0U;
unsigned int i = 0U;
struct aq_ring_buff_s *buff_ = NULL;
u16 hdr_len;
if (buff->is_error) {
__free_pages(buff->page, 0);
if (buff->is_error)
continue;
}
if (buff->is_cleaned)
continue;
@ -246,45 +329,66 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
}
}
dma_sync_single_range_for_cpu(aq_nic_get_dev(self->aq_nic),
buff->rxdata.daddr,
buff->rxdata.pg_off,
buff->len, DMA_FROM_DEVICE);
/* for single fragment packets use build_skb() */
if (buff->is_eop &&
buff->len <= AQ_CFG_RX_FRAME_MAX - AQ_SKB_ALIGN) {
skb = build_skb(page_address(buff->page),
skb = build_skb(aq_buf_vaddr(&buff->rxdata),
AQ_CFG_RX_FRAME_MAX);
if (unlikely(!skb)) {
err = -ENOMEM;
goto err_exit;
}
skb_put(skb, buff->len);
page_ref_inc(buff->rxdata.page);
} else {
skb = netdev_alloc_skb(ndev, ETH_HLEN);
skb = napi_alloc_skb(napi, AQ_CFG_RX_HDR_SIZE);
if (unlikely(!skb)) {
err = -ENOMEM;
goto err_exit;
}
skb_put(skb, ETH_HLEN);
memcpy(skb->data, page_address(buff->page), ETH_HLEN);
skb_add_rx_frag(skb, 0, buff->page, ETH_HLEN,
buff->len - ETH_HLEN,
SKB_TRUESIZE(buff->len - ETH_HLEN));
hdr_len = buff->len;
if (hdr_len > AQ_CFG_RX_HDR_SIZE)
hdr_len = eth_get_headlen(aq_buf_vaddr(&buff->rxdata),
AQ_CFG_RX_HDR_SIZE);
memcpy(__skb_put(skb, hdr_len), aq_buf_vaddr(&buff->rxdata),
ALIGN(hdr_len, sizeof(long)));
if (buff->len - hdr_len > 0) {
skb_add_rx_frag(skb, 0, buff->rxdata.page,
buff->rxdata.pg_off + hdr_len,
buff->len - hdr_len,
AQ_CFG_RX_FRAME_MAX);
page_ref_inc(buff->rxdata.page);
}
if (!buff->is_eop) {
for (i = 1U, next_ = buff->next,
buff_ = &self->buff_ring[next_];
true; next_ = buff_->next,
buff_ = &self->buff_ring[next_], ++i) {
skb_add_rx_frag(skb, i,
buff_->page, 0,
buff_->len,
SKB_TRUESIZE(buff->len -
ETH_HLEN));
buff_->is_cleaned = 1;
buff_ = buff;
i = 1U;
do {
next_ = buff_->next,
buff_ = &self->buff_ring[next_];
if (buff_->is_eop)
break;
}
dma_sync_single_range_for_cpu(
aq_nic_get_dev(self->aq_nic),
buff_->rxdata.daddr,
buff_->rxdata.pg_off,
buff_->len,
DMA_FROM_DEVICE);
skb_add_rx_frag(skb, i++,
buff_->rxdata.page,
buff_->rxdata.pg_off,
buff_->len,
AQ_CFG_RX_FRAME_MAX);
page_ref_inc(buff_->rxdata.page);
buff_->is_cleaned = 1;
} while (!buff_->is_eop);
}
}
@ -310,12 +414,15 @@ int aq_ring_rx_clean(struct aq_ring_s *self,
int aq_ring_rx_fill(struct aq_ring_s *self)
{
unsigned int pages_order = fls(AQ_CFG_RX_FRAME_MAX / PAGE_SIZE +
(AQ_CFG_RX_FRAME_MAX % PAGE_SIZE ? 1 : 0)) - 1;
unsigned int page_order = self->page_order;
struct aq_ring_buff_s *buff = NULL;
int err = 0;
int i = 0;
if (aq_ring_avail_dx(self) < min_t(unsigned int, AQ_CFG_RX_REFILL_THRES,
self->size / 2))
return err;
for (i = aq_ring_avail_dx(self); i--;
self->sw_tail = aq_ring_next_dx(self, self->sw_tail)) {
buff = &self->buff_ring[self->sw_tail];
@ -323,30 +430,15 @@ int aq_ring_rx_fill(struct aq_ring_s *self)
buff->flags = 0U;
buff->len = AQ_CFG_RX_FRAME_MAX;
buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order);
if (!buff->page) {
err = -ENOMEM;
err = aq_get_rxpages(self, buff, page_order);
if (err)
goto err_exit;
}
buff->pa = dma_map_page(aq_nic_get_dev(self->aq_nic),
buff->page, 0,
AQ_CFG_RX_FRAME_MAX, DMA_FROM_DEVICE);
if (dma_mapping_error(aq_nic_get_dev(self->aq_nic), buff->pa)) {
err = -ENOMEM;
goto err_exit;
}
buff->pa = aq_buf_daddr(&buff->rxdata);
buff = NULL;
}
err_exit:
if (err < 0) {
if (buff && buff->page)
__free_pages(buff->page, 0);
}
return err;
}
@ -359,10 +451,7 @@ void aq_ring_rx_deinit(struct aq_ring_s *self)
self->sw_head = aq_ring_next_dx(self, self->sw_head)) {
struct aq_ring_buff_s *buff = &self->buff_ring[self->sw_head];
dma_unmap_page(aq_nic_get_dev(self->aq_nic), buff->pa,
AQ_CFG_RX_FRAME_MAX, DMA_FROM_DEVICE);
__free_pages(buff->page, 0);
aq_free_rxpage(&buff->rxdata, aq_nic_get_dev(self->aq_nic));
}
err_exit:;

View file

@ -17,6 +17,13 @@
struct page;
struct aq_nic_cfg_s;
struct aq_rxpage {
struct page *page;
dma_addr_t daddr;
unsigned int order;
unsigned int pg_off;
};
/* TxC SOP DX EOP
* +----------+----------+----------+-----------
* 8bytes|len l3,l4 | pa | pa | pa
@ -31,28 +38,21 @@ struct aq_nic_cfg_s;
*/
struct __packed aq_ring_buff_s {
union {
/* RX/TX */
dma_addr_t pa;
/* RX */
struct {
u32 rss_hash;
u16 next;
u8 is_hash_l4;
u8 rsvd1;
struct page *page;
struct aq_rxpage rxdata;
};
/* EOP */
struct {
dma_addr_t pa_eop;
struct sk_buff *skb;
};
/* DX */
struct {
dma_addr_t pa;
};
/* SOP */
struct {
dma_addr_t pa_sop;
u32 len_pkt_sop;
};
/* TxC */
struct {
u32 mss;
@ -91,6 +91,9 @@ struct aq_ring_stats_rx_s {
u64 bytes;
u64 lro_packets;
u64 jumbo_packets;
u64 pg_losts;
u64 pg_flips;
u64 pg_reuses;
};
struct aq_ring_stats_tx_s {
@ -116,6 +119,7 @@ struct aq_ring_s {
unsigned int size; /* descriptors number */
unsigned int dx_size; /* TX or RX descriptor size, */
/* stored here for fater math */
unsigned int page_order;
union aq_ring_stats_s stats;
dma_addr_t dx_ring_pa;
};
@ -126,6 +130,16 @@ struct aq_ring_param_s {
cpumask_t affinity_mask;
};
static inline void *aq_buf_vaddr(struct aq_rxpage *rxpage)
{
return page_to_virt(rxpage->page) + rxpage->pg_off;
}
static inline dma_addr_t aq_buf_daddr(struct aq_rxpage *rxpage)
{
return rxpage->daddr + rxpage->pg_off;
}
static inline unsigned int aq_ring_next_dx(struct aq_ring_s *self,
unsigned int dx)
{

View file

@ -353,6 +353,9 @@ void aq_vec_add_stats(struct aq_vec_s *self,
stats_rx->errors += rx->errors;
stats_rx->jumbo_packets += rx->jumbo_packets;
stats_rx->lro_packets += rx->lro_packets;
stats_rx->pg_losts += rx->pg_losts;
stats_rx->pg_flips += rx->pg_flips;
stats_rx->pg_reuses += rx->pg_reuses;
stats_tx->packets += tx->packets;
stats_tx->bytes += tx->bytes;

View file

@ -619,8 +619,6 @@ static int hw_atl_a0_hw_ring_tx_head_update(struct aq_hw_s *self,
static int hw_atl_a0_hw_ring_rx_receive(struct aq_hw_s *self,
struct aq_ring_s *ring)
{
struct device *ndev = aq_nic_get_dev(ring->aq_nic);
for (; ring->hw_head != ring->sw_tail;
ring->hw_head = aq_ring_next_dx(ring, ring->hw_head)) {
struct aq_ring_buff_s *buff = NULL;
@ -687,8 +685,6 @@ static int hw_atl_a0_hw_ring_rx_receive(struct aq_hw_s *self,
is_err &= ~0x18U;
is_err &= ~0x04U;
dma_unmap_page(ndev, buff->pa, buff->len, DMA_FROM_DEVICE);
if (is_err || rxd_wb->type & 0x1000U) {
/* status error or DMA error */
buff->is_error = 1U;

View file

@ -259,7 +259,13 @@ static int hw_atl_b0_hw_offload_set(struct aq_hw_s *self,
hw_atl_rpo_lro_time_base_divider_set(self, 0x61AU);
hw_atl_rpo_lro_inactive_interval_set(self, 0);
hw_atl_rpo_lro_max_coalescing_interval_set(self, 2);
/* the LRO timebase divider is 5 uS (0x61a),
* which is multiplied by 50(0x32)
* to get a maximum coalescing interval of 250 uS,
* which is the default value
*/
hw_atl_rpo_lro_max_coalescing_interval_set(self, 50);
hw_atl_rpo_lro_qsessions_lim_set(self, 1U);
@ -273,6 +279,10 @@ static int hw_atl_b0_hw_offload_set(struct aq_hw_s *self,
hw_atl_rpo_lro_en_set(self,
aq_nic_cfg->is_lro ? 0xFFFFFFFFU : 0U);
hw_atl_itr_rsc_en_set(self,
aq_nic_cfg->is_lro ? 0xFFFFFFFFU : 0U);
hw_atl_itr_rsc_delay_set(self, 1U);
}
return aq_hw_err_from_flags(self);
}
@ -654,8 +664,6 @@ static int hw_atl_b0_hw_ring_tx_head_update(struct aq_hw_s *self,
static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self,
struct aq_ring_s *ring)
{
struct device *ndev = aq_nic_get_dev(ring->aq_nic);
for (; ring->hw_head != ring->sw_tail;
ring->hw_head = aq_ring_next_dx(ring, ring->hw_head)) {
struct aq_ring_buff_s *buff = NULL;
@ -697,8 +705,6 @@ static int hw_atl_b0_hw_ring_rx_receive(struct aq_hw_s *self,
buff->is_cso_err = 0U;
}
dma_unmap_page(ndev, buff->pa, buff->len, DMA_FROM_DEVICE);
if ((rx_stat & BIT(0)) || rxd_wb->type & 0x1000U) {
/* MAC error or DMA error */
buff->is_error = 1U;

View file

@ -78,7 +78,7 @@
#define HW_ATL_B0_TC_MAX 1U
#define HW_ATL_B0_RSS_MAX 8U
#define HW_ATL_B0_LRO_RXD_MAX 2U
#define HW_ATL_B0_LRO_RXD_MAX 16U
#define HW_ATL_B0_RS_SLIP_ENABLED 0U
/* (256k -1(max pay_len) - 54(header)) */

View file

@ -315,6 +315,21 @@ void hw_atl_itr_res_irq_set(struct aq_hw_s *aq_hw, u32 res_irq)
HW_ATL_ITR_RES_SHIFT, res_irq);
}
/* set RSC interrupt */
void hw_atl_itr_rsc_en_set(struct aq_hw_s *aq_hw, u32 enable)
{
aq_hw_write_reg(aq_hw, HW_ATL_ITR_RSC_EN_ADR, enable);
}
/* set RSC delay */
void hw_atl_itr_rsc_delay_set(struct aq_hw_s *aq_hw, u32 delay)
{
aq_hw_write_reg_bit(aq_hw, HW_ATL_ITR_RSC_DELAY_ADR,
HW_ATL_ITR_RSC_DELAY_MSK,
HW_ATL_ITR_RSC_DELAY_SHIFT,
delay);
}
/* rdm */
void hw_atl_rdm_cpu_id_set(struct aq_hw_s *aq_hw, u32 cpuid, u32 dca)
{

View file

@ -152,6 +152,12 @@ u32 hw_atl_itr_res_irq_get(struct aq_hw_s *aq_hw);
/* set reset interrupt */
void hw_atl_itr_res_irq_set(struct aq_hw_s *aq_hw, u32 res_irq);
/* set RSC interrupt */
void hw_atl_itr_rsc_en_set(struct aq_hw_s *aq_hw, u32 enable);
/* set RSC delay */
void hw_atl_itr_rsc_delay_set(struct aq_hw_s *aq_hw, u32 delay);
/* rdm */
/* set cpu id */

View file

@ -95,6 +95,19 @@
#define HW_ATL_ITR_RES_MSK 0x80000000
/* lower bit position of bitfield itr_reset */
#define HW_ATL_ITR_RES_SHIFT 31
/* register address for bitfield rsc_en */
#define HW_ATL_ITR_RSC_EN_ADR 0x00002200
/* register address for bitfield rsc_delay */
#define HW_ATL_ITR_RSC_DELAY_ADR 0x00002204
/* bitmask for bitfield rsc_delay */
#define HW_ATL_ITR_RSC_DELAY_MSK 0x0000000f
/* width of bitfield rsc_delay */
#define HW_ATL_ITR_RSC_DELAY_WIDTH 4
/* lower bit position of bitfield rsc_delay */
#define HW_ATL_ITR_RSC_DELAY_SHIFT 0
/* register address for bitfield dca{d}_cpuid[7:0] */
#define HW_ATL_RDM_DCADCPUID_ADR(dca) (0x00006100 + (dca) * 0x4)
/* bitmask for bitfield dca{d}_cpuid[7:0] */