pasemi_mac: further performance tweaks

Misc driver tweaks for pasemi_mac:
	* Increase ring sizes (needed mostly for 10G)
	* Remove an unneeded memory barrier
	* Move around a few prefetches and reorder a few calls
	* Don't try to clean on a full tx ring; just stop the queue
	  directly and let the tx interrupt catch up
	* Avoid filling on the same cache line that the hardware is
	  currently working on, to reduce cache line bouncing
	* Avoid unneeded clearing of software state (and make the
	  interface shutdown code handle it)
	* Fix up some of the tx ring wrap logic (see the wrap sketch
	  just before the diff)

Signed-off-by: Olof Johansson <olof@lixom.net>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
Olof Johansson, 2007-10-02 16:27:15 -05:00; committed by David S. Miller
parent 8dc121a4b6, commit ad5da10a64
1 changed file with 47 additions and 45 deletions

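The tx ring wrap handling referenced in the last bullet is plain
masked-index arithmetic. Here is a minimal standalone sketch of the
idea (illustrative C, not taken from the driver):

	#include <stdio.h>

	#define RING_SIZE 4096			/* must be a power of two */

	/* Walk the in-use region of a ring whose indices are kept masked
	 * into [0, RING_SIZE). If fill has wrapped past clean, bias the
	 * limit up by one full ring so every used entry is visited once.
	 */
	static void walk_used(unsigned int clean, unsigned int fill)
	{
		unsigned int i, limit = fill;

		if (clean > limit)	/* fill wrapped, clean has not */
			limit += RING_SIZE;

		for (i = clean; i < limit; i++)
			printf("entry %u\n", i & (RING_SIZE-1));
	}

	int main(void)
	{
		walk_used(4090, 6);	/* visits 4090..4095, then 0..5 */
		return 0;
	}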
@@ -56,8 +56,8 @@
 /* Must be a power of two */
-#define RX_RING_SIZE 512
-#define TX_RING_SIZE 512
+#define RX_RING_SIZE 4096
+#define TX_RING_SIZE 4096
 #define DEFAULT_MSG_ENABLE	\
	(NETIF_MSG_DRV		| \
@@ -336,8 +336,16 @@ static void pasemi_mac_free_tx_resources(struct net_device *dev)
 	struct pasemi_mac_buffer *info;
 	dma_addr_t dmas[MAX_SKB_FRAGS+1];
 	int freed;
+	int start, limit;
 
-	for (i = 0; i < TX_RING_SIZE; i += freed) {
+	start = mac->tx->next_to_clean;
+	limit = mac->tx->next_to_fill;
+
+	/* Compensate for when fill has wrapped and clean has not */
+	if (start > limit)
+		limit += TX_RING_SIZE;
+
+	for (i = start; i < limit; i += freed) {
 		info = &TX_RING_INFO(mac, i+1);
 		if (info->dma && info->skb) {
 			for (j = 0; j <= skb_shinfo(info->skb)->nr_frags; j++)
@@ -520,9 +528,6 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit)
 	n = mac->rx->next_to_clean;
 
 	for (count = limit; count; count--) {
-
-		rmb();
-
 		macrx = RX_RING(mac, n);
 
 		if ((macrx & XCT_MACRX_E) ||
@ -550,14 +555,10 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit)
break; break;
} }
prefetchw(info);
skb = info->skb; skb = info->skb;
prefetchw(skb);
info->dma = 0;
pci_unmap_single(mac->dma_pdev, dma, skb->len, prefetch(skb);
PCI_DMA_FROMDEVICE); prefetch(&skb->data_len);
len = (macrx & XCT_MACRX_LLEN_M) >> XCT_MACRX_LLEN_S; len = (macrx & XCT_MACRX_LLEN_M) >> XCT_MACRX_LLEN_S;
@@ -576,10 +577,9 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit)
 		} else
 			info->skb = NULL;
 
-		/* Need to zero it out since hardware doesn't, since the
-		 * replenish loop uses it to tell when it's done.
-		 */
-		RX_BUFF(mac, i) = 0;
+		pci_unmap_single(mac->dma_pdev, dma, len, PCI_DMA_FROMDEVICE);
+
+		info->dma = 0;
 
 		skb_put(skb, len);
@@ -599,6 +599,11 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit)
 		RX_RING(mac, n) = 0;
 		RX_RING(mac, n+1) = 0;
 
+		/* Need to zero it out since hardware doesn't, since the
+		 * replenish loop uses it to tell when it's done.
+		 */
+		RX_BUFF(mac, i) = 0;
+
 		n += 2;
 	}
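The comment above is the whole contract here: the hardware never clears
RX_BUFF, so the clean path zeroes it and the replenish side treats a
nonzero descriptor as its stop marker. A hedged sketch of that sentinel
pattern (illustrative only; the driver's real replenish loop carries
more bookkeeping than this):

	#include <stdint.h>

	#define RING_SIZE 4096			/* power of two */

	/* Hypothetical descriptor array: nonzero = slot owns a buffer. */
	static uint64_t rx_buff[RING_SIZE];

	static uint64_t alloc_and_map_buffer(void)
	{
		static uint64_t cookie = 1;
		return cookie++;	/* stand-in for a real DMA mapping */
	}

	/* Refill zeroed (consumed) slots; stop at the first slot that
	 * still holds a buffer. A stale nonzero value would end the
	 * refill early -- hence the explicit RX_BUFF(mac, i) = 0 above.
	 */
	static void replenish(unsigned int next_to_fill)
	{
		unsigned int i = next_to_fill;

		while (rx_buff[i] == 0) {
			rx_buff[i] = alloc_and_map_buffer();
			i = (i + 1) & (RING_SIZE-1);
		}
	}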
@@ -621,27 +626,33 @@ static int pasemi_mac_clean_rx(struct pasemi_mac *mac, int limit)
 static int pasemi_mac_clean_tx(struct pasemi_mac *mac)
 {
 	int i, j;
-	struct pasemi_mac_buffer *info;
-	unsigned int start, descr_count, buf_count, limit;
+	unsigned int start, descr_count, buf_count, batch_limit;
+	unsigned int ring_limit;
 	unsigned int total_count;
 	unsigned long flags;
 	struct sk_buff *skbs[TX_CLEAN_BATCHSIZE];
 	dma_addr_t dmas[TX_CLEAN_BATCHSIZE][MAX_SKB_FRAGS+1];
 
 	total_count = 0;
-	limit = TX_CLEAN_BATCHSIZE;
+	batch_limit = TX_CLEAN_BATCHSIZE;
 restart:
 	spin_lock_irqsave(&mac->tx->lock, flags);
 
 	start = mac->tx->next_to_clean;
+	ring_limit = mac->tx->next_to_fill;
+
+	/* Compensate for when fill has wrapped but clean has not */
+	if (start > ring_limit)
+		ring_limit += TX_RING_SIZE;
+
 	buf_count = 0;
 	descr_count = 0;
 
 	for (i = start;
-	     descr_count < limit && i < mac->tx->next_to_fill;
+	     descr_count < batch_limit && i < ring_limit;
 	     i += buf_count) {
 		u64 mactx = TX_RING(mac, i);
+		struct sk_buff *skb;
 
 		if ((mactx & XCT_MACTX_E) ||
 		    (*mac->tx_status & PAS_STATUS_ERROR))
@@ -651,19 +662,15 @@ restart:
 			/* Not yet transmitted */
 			break;
 
-		info = &TX_RING_INFO(mac, i+1);
-		skbs[descr_count] = info->skb;
+		skb = TX_RING_INFO(mac, i+1).skb;
+		skbs[descr_count] = skb;
 
-		buf_count = 2 + skb_shinfo(info->skb)->nr_frags;
-		for (j = 0; j <= skb_shinfo(info->skb)->nr_frags; j++)
+		buf_count = 2 + skb_shinfo(skb)->nr_frags;
+		for (j = 0; j <= skb_shinfo(skb)->nr_frags; j++)
 			dmas[descr_count][j] = TX_RING_INFO(mac, i+1+j).dma;
 
-		info->dma = 0;
 		TX_RING(mac, i) = 0;
 		TX_RING(mac, i+1) = 0;
+		TX_RING_INFO(mac, i+1).skb = 0;
+		TX_RING_INFO(mac, i+1).dma = 0;
 
 		/* Since we always fill with an even number of entries, make
 		 * sure we skip any unused one at the end as well.
@@ -672,7 +679,7 @@ restart:
 			buf_count++;
 		descr_count++;
 	}
-	mac->tx->next_to_clean = i;
+	mac->tx->next_to_clean = i & (TX_RING_SIZE-1);
 	spin_unlock_irqrestore(&mac->tx->lock, flags);
 	netif_wake_queue(mac->netdev);
@@ -683,7 +690,7 @@ restart:
 	total_count += descr_count;
 
 	/* If the batch was full, try to clean more */
-	if (descr_count == limit)
+	if (descr_count == batch_limit)
 		goto restart;
 
 	return total_count;
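The restart loop above follows a common batched-clean shape: harvest
completed entries under the lock into a fixed-size batch, drop the lock,
do the expensive unmap/free work, and go around again if the batch
filled up. A compact sketch with the driver's locking and DMA teardown
stubbed out (illustrative, not the driver's code):

	#include <stdbool.h>

	#define BATCH 32

	/* Stubs standing in for the real lock and per-buffer teardown. */
	static void lock(void)   { }
	static void unlock(void) { }
	static bool harvest_one(void **buf) { (void)buf; return false; }
	static void free_buf(void *buf)     { (void)buf; }

	static int clean_batched(void)
	{
		void *batch[BATCH];
		int i, n, total = 0;

	restart:
		lock();
		for (n = 0; n < BATCH; n++)
			if (!harvest_one(&batch[n]))
				break;
		unlock();

		for (i = 0; i < n; i++)	/* heavy work, lock dropped */
			free_buf(batch[i]);

		total += n;
		if (n == BATCH)		/* full batch: maybe more left */
			goto restart;

		return total;
	}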
@@ -1106,19 +1113,14 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev)
 	spin_lock_irqsave(&txring->lock, flags);
 
-	if (RING_AVAIL(txring) <= nfrags+3) {
-		spin_unlock_irqrestore(&txring->lock, flags);
-		pasemi_mac_clean_tx(mac);
-		pasemi_mac_restart_tx_intr(mac);
-		spin_lock_irqsave(&txring->lock, flags);
-
-		if (RING_AVAIL(txring) <= nfrags+3) {
-			/* Still no room -- stop the queue and wait for tx
-			 * intr when there's room.
-			 */
-			netif_stop_queue(dev);
-			goto out_err;
-		}
+	/* Avoid stepping on the same cache line that the DMA controller
+	 * is currently about to send, so leave at least 8 words available.
+	 * Total free space needed is mactx + fragments + 8
+	 */
+	if (RING_AVAIL(txring) < nfrags + 10) {
+		/* no room -- stop the queue and wait for tx intr */
+		netif_stop_queue(dev);
+		goto out_err;
 	}
 
 	TX_RING(mac, txring->next_to_fill) = mactx;
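The new threshold is just a descriptor count plus the headroom the
comment spells out: one mactx word, nfrags+1 buffer words (filled in
even pairs), and 8 spare words so the fill pointer never lands on the
cache line the DMA engine is reading. A worked check of that arithmetic
(hypothetical values, not driver code):

	#include <assert.h>

	int main(void)
	{
		unsigned int nfrags = 3;		/* example packet */
		unsigned int needed = 1			/* mactx descriptor */
				    + (nfrags + 1)	/* buffer pointers */
				    + 8;		/* cache-line headroom */

		/* matches the RING_AVAIL(txring) < nfrags + 10 test above */
		assert(needed == nfrags + 10);
		return 0;
	}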
@@ -1137,8 +1139,8 @@ static int pasemi_mac_start_tx(struct sk_buff *skb, struct net_device *dev)
 	if (nfrags & 1)
 		nfrags++;
 
-	txring->next_to_fill += nfrags + 1;
+	txring->next_to_fill = (txring->next_to_fill + nfrags + 1) &
+				(TX_RING_SIZE-1);
 
 	dev->stats.tx_packets++;
 	dev->stats.tx_bytes += skb->len;