sfc: Add SR-IOV back-end support for SFC9000 family

On the SFC9000 family, each port has 1024 Virtual Interfaces (VIs),
each with an RX queue, a TX queue, an event queue and a mailbox
register.  These may be assigned to up to 127 SR-IOV virtual functions
per port, with up to 64 VIs per VF.
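
A minimal sketch of that mapping (not part of the patch; vi_base = 128 mirrors
EFX_VI_BASE and vi_scale = 6 is the maximum EFX_VI_SCALE_MAX, both defined in
nic.h below):

#include <stdio.h>

int main(void)
{
	unsigned int vi_base = 128;	/* EFX_VI_BASE */
	unsigned int vi_scale = 6;	/* EFX_VI_SCALE_MAX: 64 VIs per VF */
	unsigned int vf;

	for (vf = 0; vf < 3; vf++) {
		unsigned int first = vi_base + (vf << vi_scale);
		unsigned int last = vi_base + ((vf + 1) << vi_scale) - 1;

		printf("VF %u owns VIs %u..%u\n", vf, first, last);
	}

	/* At vi_scale = 6 only (1024 - 128) >> 6 = 14 VFs fit in the
	 * 1024 VIs; reaching the 127-VF limit needs a smaller vi_scale. */
	return 0;
}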

We allocate an extra channel (IRQ and event queue only) to receive
requests from VF drivers.

There is a per-port limit of 4 concurrent RX queue flushes, and queue
flushes may be initiated by the MC in response to a Function Level
Reset (FLR) of a VF.  Therefore, when SR-IOV is in use, we submit all
flush requests via the MC.

The RSS indirection table is shared with VFs, so the number of RX
queues used in the PF is limited to the number of VIs per VF.
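
A minimal sketch of that clamp (a standalone rendering of the check added to
efx_wanted_parallelism() below; the helper names follow the patch):

static unsigned int clamp_rss_channels(unsigned int wanted,
				       unsigned int vf_count,
				       unsigned int vi_scale)
{
	unsigned int vf_size = 1 << vi_scale;	/* efx_vf_size() */

	/* The indirection table is shared, so the PF must not spread
	 * RSS over more RX queues than each VF can address. */
	if (vf_count != 0 && vf_size > 1 && wanted > vf_size)
		wanted = vf_size;
	return wanted;
}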

This is almost entirely the work of Steve Hodgson, formerly
shodgson@solarflare.com.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
commit cd2d5b529c (parent 28e47c498a)
Ben Hutchings, 2012-02-14 00:48:07 +00:00
13 changed files with 2194 additions and 28 deletions

drivers/net/ethernet/sfc/Kconfig

@@ -26,3 +26,11 @@ config SFC_MCDI_MON
---help---
This exposes the on-board firmware-managed sensors as a
hardware monitor device.
config SFC_SRIOV
bool "Solarflare SFC9000-family SR-IOV support"
depends on SFC && PCI_IOV
default y
---help---
This enables support for the SFC9000 I/O Virtualization
features, allowing accelerated network performance in
virtualized environments.

drivers/net/ethernet/sfc/Makefile

@@ -4,5 +4,6 @@ sfc-y += efx.o nic.o falcon.o siena.o tx.o rx.o filter.o \
tenxpress.o txc43128_phy.o falcon_boards.o \
mcdi.o mcdi_phy.o mcdi_mon.o
sfc-$(CONFIG_SFC_MTD) += mtd.o
sfc-$(CONFIG_SFC_SRIOV) += siena_sriov.o
obj-$(CONFIG_SFC) += sfc.o

drivers/net/ethernet/sfc/efx.c

@@ -1175,25 +1175,40 @@ static unsigned int efx_wanted_parallelism(struct efx_nic *efx)
 	unsigned int count;
 	int cpu;
 
-	if (rss_cpus)
-		return rss_cpus;
-
-	if (unlikely(!zalloc_cpumask_var(&thread_mask, GFP_KERNEL))) {
-		netif_warn(efx, probe, efx->net_dev,
-			   "RSS disabled due to allocation failure\n");
-		return 1;
-	}
-
-	count = 0;
-	for_each_online_cpu(cpu) {
-		if (!cpumask_test_cpu(cpu, thread_mask)) {
-			++count;
-			cpumask_or(thread_mask, thread_mask,
-				   topology_thread_cpumask(cpu));
-		}
-	}
-
-	free_cpumask_var(thread_mask);
+	if (rss_cpus) {
+		count = rss_cpus;
+	} else {
+		if (unlikely(!zalloc_cpumask_var(&thread_mask, GFP_KERNEL))) {
+			netif_warn(efx, probe, efx->net_dev,
+				   "RSS disabled due to allocation failure\n");
+			return 1;
+		}
+
+		count = 0;
+		for_each_online_cpu(cpu) {
+			if (!cpumask_test_cpu(cpu, thread_mask)) {
+				++count;
+				cpumask_or(thread_mask, thread_mask,
					   topology_thread_cpumask(cpu));
+			}
+		}
+
+		free_cpumask_var(thread_mask);
+	}
+
+	/* If RSS is requested for the PF *and* VFs then we can't write RSS
+	 * table entries that are inaccessible to VFs
+	 */
+	if (efx_sriov_wanted(efx) && efx_vf_size(efx) > 1 &&
+	    count > efx_vf_size(efx)) {
+		netif_warn(efx, probe, efx->net_dev,
+			   "Reducing number of RSS channels from %u to %u for "
+			   "VF support. Increase vf-msix-limit to use more "
+			   "channels on the PF.\n",
+			   count, efx_vf_size(efx));
+		count = efx_vf_size(efx);
+	}
 
 	return count;
 }
@@ -1327,6 +1342,10 @@ static int efx_probe_interrupts(struct efx_nic *efx)
}
}
/* RSS might be usable on VFs even if it is disabled on the PF */
efx->rss_spread = (efx->n_rx_channels > 1 ?
efx->n_rx_channels : efx_vf_size(efx));
return 0;
}
@@ -1426,7 +1445,7 @@ static int efx_probe_nic(struct efx_nic *efx)
 	get_random_bytes(&efx->rx_hash_key, sizeof(efx->rx_hash_key));
 	for (i = 0; i < ARRAY_SIZE(efx->rx_indir_table); i++)
 		efx->rx_indir_table[i] =
-			ethtool_rxfh_indir_default(i, efx->n_rx_channels);
+			ethtool_rxfh_indir_default(i, efx->rss_spread);
efx_set_channels(efx);
netif_set_real_num_tx_queues(efx->net_dev, efx->n_tx_channels);
@@ -1915,6 +1934,7 @@ static int efx_set_mac_address(struct net_device *net_dev, void *data)
}
memcpy(net_dev->dev_addr, new_addr, net_dev->addr_len);
efx_sriov_mac_address_changed(efx);
/* Reconfigure the MAC */
mutex_lock(&efx->mac_lock);
@@ -1981,6 +2001,12 @@ static const struct net_device_ops efx_netdev_ops = {
.ndo_set_mac_address = efx_set_mac_address,
.ndo_set_rx_mode = efx_set_rx_mode,
.ndo_set_features = efx_set_features,
#ifdef CONFIG_SFC_SRIOV
.ndo_set_vf_mac = efx_sriov_set_vf_mac,
.ndo_set_vf_vlan = efx_sriov_set_vf_vlan,
.ndo_set_vf_spoofchk = efx_sriov_set_vf_spoofchk,
.ndo_get_vf_config = efx_sriov_get_vf_config,
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = efx_netpoll,
#endif
@@ -2150,6 +2176,7 @@ int efx_reset_up(struct efx_nic *efx, enum reset_type method, bool ok)
efx_start_interrupts(efx, false);
efx_restore_filters(efx);
efx_sriov_reset(efx);
mutex_unlock(&efx->mac_lock);
@@ -2440,6 +2467,7 @@ static void efx_pci_remove(struct pci_dev *pci_dev)
rtnl_unlock();
efx_stop_interrupts(efx, false);
efx_sriov_fini(efx);
efx_unregister_netdev(efx);
efx_mtd_remove(efx);
@@ -2581,6 +2609,11 @@ static int __devinit efx_pci_probe(struct pci_dev *pci_dev,
if (rc)
goto fail4;
rc = efx_sriov_init(efx);
if (rc)
netif_err(efx, probe, efx->net_dev,
"SR-IOV can't be enabled rc %d\n", rc);
netif_dbg(efx, probe, efx->net_dev, "initialisation successful\n");
/* Try to create MTDs, but allow this to fail */
@@ -2732,6 +2765,10 @@ static int __init efx_init_module(void)
if (rc)
goto err_notifier;
rc = efx_init_sriov();
if (rc)
goto err_sriov;
reset_workqueue = create_singlethread_workqueue("sfc_reset");
if (!reset_workqueue) {
rc = -ENOMEM;
@@ -2747,6 +2784,8 @@ static int __init efx_init_module(void)
err_pci:
destroy_workqueue(reset_workqueue);
err_reset:
efx_fini_sriov();
err_sriov:
unregister_netdevice_notifier(&efx_netdev_notifier);
err_notifier:
return rc;
@@ -2758,6 +2797,7 @@ static void __exit efx_exit_module(void)
pci_unregister_driver(&efx_pci_driver);
destroy_workqueue(reset_workqueue);
efx_fini_sriov();
unregister_netdevice_notifier(&efx_netdev_notifier);
}

drivers/net/ethernet/sfc/ethtool.c

@@ -1085,7 +1085,8 @@ static u32 efx_ethtool_get_rxfh_indir_size(struct net_device *net_dev)
 {
 	struct efx_nic *efx = netdev_priv(net_dev);
 
-	return (efx_nic_rev(efx) < EFX_REV_FALCON_B0 ?
+	return ((efx_nic_rev(efx) < EFX_REV_FALCON_B0 ||
+		 efx->n_rx_channels == 1) ?
 		0 : ARRAY_SIZE(efx->rx_indir_table));
}

drivers/net/ethernet/sfc/mcdi.c

@@ -560,6 +560,9 @@ void efx_mcdi_process_event(struct efx_channel *channel,
case MCDI_EVENT_CODE_MAC_STATS_DMA:
/* MAC stats are gathered lazily. We can ignore this. */
break;
case MCDI_EVENT_CODE_FLR:
efx_sriov_flr(efx, MCDI_EVENT_FIELD(*event, FLR_VF));
break;
default:
netif_err(efx, hw, efx->net_dev, "Unknown MCDI event 0x%x\n",
@@ -1154,6 +1157,37 @@ fail:
return rc;
}
int efx_mcdi_flush_rxqs(struct efx_nic *efx)
{
struct efx_channel *channel;
struct efx_rx_queue *rx_queue;
__le32 *qid;
int rc, count;
qid = kmalloc(EFX_MAX_CHANNELS * sizeof(*qid), GFP_KERNEL);
if (qid == NULL)
return -ENOMEM;
count = 0;
efx_for_each_channel(channel, efx) {
efx_for_each_channel_rx_queue(rx_queue, channel) {
if (rx_queue->flush_pending) {
rx_queue->flush_pending = false;
atomic_dec(&efx->rxq_flush_pending);
qid[count++] = cpu_to_le32(
efx_rx_queue_index(rx_queue));
}
}
}
rc = efx_mcdi_rpc(efx, MC_CMD_FLUSH_RX_QUEUES, (u8 *)qid,
count * sizeof(*qid), NULL, 0, NULL);
WARN_ON(rc > 0);
kfree(qid);
return rc;
}
int efx_mcdi_wol_filter_reset(struct efx_nic *efx)
{

drivers/net/ethernet/sfc/mcdi.h

@@ -146,6 +146,8 @@ extern int efx_mcdi_wol_filter_set_magic(struct efx_nic *efx,
extern int efx_mcdi_wol_filter_get_magic(struct efx_nic *efx, int *id_out);
extern int efx_mcdi_wol_filter_remove(struct efx_nic *efx, int id);
extern int efx_mcdi_wol_filter_reset(struct efx_nic *efx);
extern int efx_mcdi_flush_rxqs(struct efx_nic *efx);
extern int efx_mcdi_set_mac(struct efx_nic *efx);
extern int efx_mcdi_mac_stats(struct efx_nic *efx, dma_addr_t dma_addr,
u32 dma_len, int enable, int clear);
extern int efx_mcdi_mac_reconfigure(struct efx_nic *efx);

drivers/net/ethernet/sfc/mcdi_mac.c

@@ -12,7 +12,7 @@
 #include "mcdi.h"
 #include "mcdi_pcol.h"
 
-static int efx_mcdi_set_mac(struct efx_nic *efx)
+int efx_mcdi_set_mac(struct efx_nic *efx)
{
u32 reject, fcntl;
u8 cmdbytes[MC_CMD_SET_MAC_IN_LEN];

drivers/net/ethernet/sfc/net_driver.h

@@ -24,6 +24,7 @@
#include <linux/device.h>
#include <linux/highmem.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/i2c.h>
@@ -54,7 +55,8 @@
 #define EFX_MAX_CHANNELS 32U
 #define EFX_MAX_RX_QUEUES EFX_MAX_CHANNELS
-#define EFX_MAX_EXTRA_CHANNELS 0U
+#define EFX_EXTRA_CHANNEL_IOV 0
+#define EFX_MAX_EXTRA_CHANNELS 1U
/* Checksum generation is a per-queue option in hardware, so each
* queue visible to the networking core is backed by two hardware TX
@@ -629,6 +631,8 @@ union efx_multicast_hash {
};
struct efx_filter_state;
struct efx_vf;
struct vfdi_status;
/**
* struct efx_nic - an Efx NIC
@@ -712,6 +716,17 @@ struct efx_filter_state;
* completed (either success or failure). Not used when MCDI is used to
* flush receive queues.
* @flush_wq: wait queue used by efx_nic_flush_queues() to wait for flush completions.
* @vf: Array of &struct efx_vf objects.
* @vf_count: Number of VFs intended to be enabled.
* @vf_init_count: Number of VFs that have been fully initialised.
* @vi_scale: log2 number of vnics per VF.
* @vf_buftbl_base: The zeroth buffer table index used to back VF queues.
* @vfdi_status: Common VFDI status page to be DMA'd to VF address space.
* @local_addr_list: List of local addresses. Protected by %local_lock.
* @local_page_list: List of DMA addressable pages used to broadcast
* %local_addr_list. Protected by %local_lock.
* @local_lock: Mutex protecting %local_addr_list and %local_page_list.
* @peer_work: Work item to broadcast peer addresses to VMs.
* @monitor_work: Hardware monitor workitem
* @biu_lock: BIU (bus interface unit) lock
* @last_irq_cpu: Last CPU to handle a possible test interrupt. This
@@ -762,6 +777,7 @@ struct efx_nic {
unsigned next_buffer_table;
unsigned n_channels;
unsigned n_rx_channels;
unsigned rss_spread;
unsigned tx_channel_offset;
unsigned n_tx_channels;
unsigned int rx_buffer_len;
@@ -820,6 +836,20 @@ struct efx_nic {
atomic_t rxq_flush_outstanding;
wait_queue_head_t flush_wq;
#ifdef CONFIG_SFC_SRIOV
struct efx_channel *vfdi_channel;
struct efx_vf *vf;
unsigned vf_count;
unsigned vf_init_count;
unsigned vi_scale;
unsigned vf_buftbl_base;
struct efx_buffer vfdi_status;
struct list_head local_addr_list;
struct list_head local_page_list;
struct mutex local_lock;
struct work_struct peer_work;
#endif
/* The following fields may be written more often */
struct delayed_work monitor_work ____cacheline_aligned_in_smp;

drivers/net/ethernet/sfc/nic.c

@@ -264,6 +264,10 @@ static int efx_alloc_special_buffer(struct efx_nic *efx,
/* Select new buffer ID */
buffer->index = efx->next_buffer_table;
efx->next_buffer_table += buffer->entries;
#ifdef CONFIG_SFC_SRIOV
BUG_ON(efx_sriov_enabled(efx) &&
efx->vf_buftbl_base < efx->next_buffer_table);
#endif
netif_dbg(efx, probe, efx->net_dev,
"allocating special buffers %d-%d at %llx+%x "
@@ -693,6 +697,16 @@ int efx_nic_flush_queues(struct efx_nic *efx)
}
while (timeout && atomic_read(&efx->drain_pending) > 0) {
/* If SRIOV is enabled, then offload receive queue flushing to
* the firmware (though we will still have to poll for
* completion). If that fails, fall back to the old scheme.
*/
if (efx_sriov_enabled(efx)) {
rc = efx_mcdi_flush_rxqs(efx);
if (!rc)
goto wait;
}
/* The hardware supports four concurrent rx flushes, each of
* which may need to be retried if there is an outstanding
* descriptor fetch
@@ -712,6 +726,7 @@ int efx_nic_flush_queues(struct efx_nic *efx)
}
}
wait:
timeout = wait_event_timeout(efx->flush_wq, efx_flush_wake(efx),
timeout);
}
@@ -1102,11 +1117,13 @@ efx_handle_driver_event(struct efx_channel *channel, efx_qword_t *event)
netif_vdbg(efx, hw, efx->net_dev, "channel %d TXQ %d flushed\n",
channel->channel, ev_sub_data);
efx_handle_tx_flush_done(efx, event);
efx_sriov_tx_flush_done(efx, event);
break;
case FSE_AZ_RX_DESCQ_FLS_DONE_EV:
netif_vdbg(efx, hw, efx->net_dev, "channel %d RXQ %d flushed\n",
channel->channel, ev_sub_data);
efx_handle_rx_flush_done(efx, event);
efx_sriov_rx_flush_done(efx, event);
break;
case FSE_AZ_EVQ_INIT_DONE_EV:
netif_dbg(efx, hw, efx->net_dev,
@@ -1138,16 +1155,24 @@ efx_handle_driver_event(struct efx_channel *channel, efx_qword_t *event)
 			  RESET_TYPE_DISABLE);
 		break;
 	case FSE_BZ_RX_DSC_ERROR_EV:
-		netif_err(efx, rx_err, efx->net_dev,
-			  "RX DMA Q %d reports descriptor fetch error."
-			  " RX Q %d is disabled.\n", ev_sub_data, ev_sub_data);
-		efx_schedule_reset(efx, RESET_TYPE_RX_DESC_FETCH);
+		if (ev_sub_data < EFX_VI_BASE) {
+			netif_err(efx, rx_err, efx->net_dev,
+				  "RX DMA Q %d reports descriptor fetch error."
+				  " RX Q %d is disabled.\n", ev_sub_data,
+				  ev_sub_data);
+			efx_schedule_reset(efx, RESET_TYPE_RX_DESC_FETCH);
+		} else
+			efx_sriov_desc_fetch_err(efx, ev_sub_data);
 		break;
 	case FSE_BZ_TX_DSC_ERROR_EV:
-		netif_err(efx, tx_err, efx->net_dev,
-			  "TX DMA Q %d reports descriptor fetch error."
-			  " TX Q %d is disabled.\n", ev_sub_data, ev_sub_data);
-		efx_schedule_reset(efx, RESET_TYPE_TX_DESC_FETCH);
+		if (ev_sub_data < EFX_VI_BASE) {
+			netif_err(efx, tx_err, efx->net_dev,
+				  "TX DMA Q %d reports descriptor fetch error."
+				  " TX Q %d is disabled.\n", ev_sub_data,
+				  ev_sub_data);
+			efx_schedule_reset(efx, RESET_TYPE_TX_DESC_FETCH);
+		} else
+			efx_sriov_desc_fetch_err(efx, ev_sub_data);
 		break;
default:
netif_vdbg(efx, hw, efx->net_dev,
@@ -1207,6 +1232,9 @@ int efx_nic_process_eventq(struct efx_channel *channel, int budget)
case FSE_AZ_EV_CODE_DRIVER_EV:
efx_handle_driver_event(channel, &event);
break;
case FSE_CZ_EV_CODE_USER_EV:
efx_sriov_event(channel, &event);
break;
case FSE_CZ_EV_CODE_MCDI_EV:
efx_mcdi_process_event(channel, &event);
break;
@@ -1609,6 +1637,15 @@ void efx_nic_fini_interrupt(struct efx_nic *efx)
free_irq(efx->legacy_irq, efx);
}
/* Looks at available SRAM resources and works out how many queues we
* can support, and where things like descriptor caches should live.
*
* SRAM is split up as follows:
* 0                     buftbl entries for channels
* efx->vf_buftbl_base   buftbl entries for SR-IOV
* efx->rx_dc_base       RX descriptor caches
* efx->tx_dc_base       TX descriptor caches
*/
void efx_nic_dimension_resources(struct efx_nic *efx, unsigned sram_lim_qw)
{
unsigned vi_count, buftbl_min;
@@ -1622,6 +1659,32 @@ void efx_nic_dimension_resources(struct efx_nic *efx, unsigned sram_lim_qw)
* sizeof(efx_qword_t) / EFX_BUF_SIZE);
vi_count = max(efx->n_channels, efx->n_tx_channels * EFX_TXQ_TYPES);
#ifdef CONFIG_SFC_SRIOV
if (efx_sriov_wanted(efx)) {
unsigned vi_dc_entries, buftbl_free, entries_per_vf, vf_limit;
efx->vf_buftbl_base = buftbl_min;
vi_dc_entries = RX_DC_ENTRIES + TX_DC_ENTRIES;
vi_count = max(vi_count, EFX_VI_BASE);
buftbl_free = (sram_lim_qw - buftbl_min -
vi_count * vi_dc_entries);
entries_per_vf = ((vi_dc_entries + EFX_VF_BUFTBL_PER_VI) *
efx_vf_size(efx));
vf_limit = min(buftbl_free / entries_per_vf,
(1024U - EFX_VI_BASE) >> efx->vi_scale);
if (efx->vf_count > vf_limit) {
netif_err(efx, probe, efx->net_dev,
"Reducing VF count from %d to %d\n",
efx->vf_count, vf_limit);
efx->vf_count = vf_limit;
}
vi_count += efx->vf_count * efx_vf_size(efx);
}
#endif
efx->tx_dc_base = sram_lim_qw - vi_count * TX_DC_ENTRIES;
efx->rx_dc_base = efx->tx_dc_base - vi_count * RX_DC_ENTRIES;
}
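
Worked numbers for the per-VI reservation used above (EFX_MAX_VF_EVQ_SIZE and
EFX_VF_BUFTBL_PER_VI come from nic.h below; EFX_MAX_DMAQ_SIZE = 4096,
EFX_BUF_SIZE = 4096 and the 8-byte descriptor size are assumed Siena values,
so treat this as a sketch):

#include <stdio.h>

#define EFX_MAX_VF_EVQ_SIZE 8192UL	/* from this patch (nic.h) */
#define EFX_MAX_DMAQ_SIZE 4096UL	/* assumed Siena value */
#define EFX_BUF_SIZE 4096		/* assumed: 4K buffers */
#define QWORD_SIZE 8			/* sizeof(efx_qword_t) */

int main(void)
{
	/* Mirrors EFX_VF_BUFTBL_PER_VI: one 8K-entry event queue plus an
	 * RX and a TX DMA queue of 4K entries each, 8 bytes per entry,
	 * backed by 4K pages: (8192 + 2 * 4096) * 8 / 4096 = 32. */
	unsigned long per_vi = (EFX_MAX_VF_EVQ_SIZE + 2 * EFX_MAX_DMAQ_SIZE) *
			       QWORD_SIZE / EFX_BUF_SIZE;

	printf("buftbl entries per VF VI: %lu\n", per_vi);	/* 32 */
	printf("per VF at vi_scale = 6 (64 VIs): %lu\n", per_vi * 64);
	return 0;
}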

drivers/net/ethernet/sfc/nic.h

@@ -169,6 +169,95 @@ static inline struct efx_mcdi_mon *efx_mcdi_mon(struct efx_nic *efx)
}
#endif
/*
* On the SFC9000 family each port is associated with 1 PCI physical
* function (PF) handled by sfc and a configurable number of virtual
* functions (VFs) that may be handled by some other driver, often in
* a VM guest. The queue pointer registers are mapped in both PF and
* VF BARs such that an 8K region provides access to a single RX, TX
* and event queue (collectively a Virtual Interface, VI or VNIC).
*
* The PF has access to all 1024 VIs while VFs are mapped to VIs
* according to VI_BASE and VI_SCALE: VF i has access to VIs numbered
* in range [VI_BASE + (i << VI_SCALE), VI_BASE + ((i + 1) << VI_SCALE)).
* The number of VIs and the VI_SCALE value are configurable but must
* be established at boot time by firmware.
*/
/* Maximum VI_SCALE parameter supported by Siena */
#define EFX_VI_SCALE_MAX 6
/* Base VI to use for SR-IOV. Must be aligned to (1 << EFX_VI_SCALE_MAX),
* so this is the smallest allowed value. */
#define EFX_VI_BASE 128U
/* Maximum number of VFs allowed */
#define EFX_VF_COUNT_MAX 127
/* Limit EVQs on VFs to be only 8k to reduce buffer table reservation */
#define EFX_MAX_VF_EVQ_SIZE 8192UL
/* The number of buffer table entries reserved for each VI on a VF */
#define EFX_VF_BUFTBL_PER_VI \
((EFX_MAX_VF_EVQ_SIZE + 2 * EFX_MAX_DMAQ_SIZE) * \
sizeof(efx_qword_t) / EFX_BUF_SIZE)
#ifdef CONFIG_SFC_SRIOV
static inline bool efx_sriov_wanted(struct efx_nic *efx)
{
return efx->vf_count != 0;
}
static inline bool efx_sriov_enabled(struct efx_nic *efx)
{
return efx->vf_init_count != 0;
}
static inline unsigned int efx_vf_size(struct efx_nic *efx)
{
return 1 << efx->vi_scale;
}
extern int efx_init_sriov(void);
extern void efx_sriov_probe(struct efx_nic *efx);
extern int efx_sriov_init(struct efx_nic *efx);
extern void efx_sriov_mac_address_changed(struct efx_nic *efx);
extern void efx_sriov_tx_flush_done(struct efx_nic *efx, efx_qword_t *event);
extern void efx_sriov_rx_flush_done(struct efx_nic *efx, efx_qword_t *event);
extern void efx_sriov_event(struct efx_channel *channel, efx_qword_t *event);
extern void efx_sriov_desc_fetch_err(struct efx_nic *efx, unsigned dmaq);
extern void efx_sriov_flr(struct efx_nic *efx, unsigned flr);
extern void efx_sriov_reset(struct efx_nic *efx);
extern void efx_sriov_fini(struct efx_nic *efx);
extern void efx_fini_sriov(void);
#else
static inline bool efx_sriov_wanted(struct efx_nic *efx) { return false; }
static inline bool efx_sriov_enabled(struct efx_nic *efx) { return false; }
static inline unsigned int efx_vf_size(struct efx_nic *efx) { return 0; }
static inline int efx_init_sriov(void) { return 0; }
static inline void efx_sriov_probe(struct efx_nic *efx) {}
static inline int efx_sriov_init(struct efx_nic *efx) { return -EOPNOTSUPP; }
static inline void efx_sriov_mac_address_changed(struct efx_nic *efx) {}
static inline void efx_sriov_tx_flush_done(struct efx_nic *efx,
efx_qword_t *event) {}
static inline void efx_sriov_rx_flush_done(struct efx_nic *efx,
efx_qword_t *event) {}
static inline void efx_sriov_event(struct efx_channel *channel,
efx_qword_t *event) {}
static inline void efx_sriov_desc_fetch_err(struct efx_nic *efx, unsigned dmaq) {}
static inline void efx_sriov_flr(struct efx_nic *efx, unsigned flr) {}
static inline void efx_sriov_reset(struct efx_nic *efx) {}
static inline void efx_sriov_fini(struct efx_nic *efx) {}
static inline void efx_fini_sriov(void) {}
#endif
extern int efx_sriov_set_vf_mac(struct net_device *dev, int vf, u8 *mac);
extern int efx_sriov_set_vf_vlan(struct net_device *dev, int vf,
u16 vlan, u8 qos);
extern int efx_sriov_get_vf_config(struct net_device *dev, int vf,
struct ifla_vf_info *ivf);
extern int efx_sriov_set_vf_spoofchk(struct net_device *net_dev, int vf,
bool spoofchk);
extern const struct efx_nic_type falcon_a1_nic_type;
extern const struct efx_nic_type falcon_b0_nic_type;
extern const struct efx_nic_type siena_a0_nic_type;

drivers/net/ethernet/sfc/siena.c

@@ -313,6 +313,8 @@ static int siena_probe_nic(struct efx_nic *efx)
if (rc)
goto fail5;
efx_sriov_probe(efx);
return 0;
fail5:

drivers/net/ethernet/sfc/siena_sriov.c (new file): diff suppressed because it is too large

drivers/net/ethernet/sfc/vfdi.h (new file)

@@ -0,0 +1,254 @@
/****************************************************************************
* Driver for Solarflare Solarstorm network controllers and boards
* Copyright 2010-2012 Solarflare Communications Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation, incorporated herein by reference.
*/
#ifndef _VFDI_H
#define _VFDI_H
/**
* DOC: Virtual Function Driver Interface
*
* This file contains software structures used to form a two way
* communication channel between the VF driver and the PF driver,
* named Virtual Function Driver Interface (VFDI).
*
* For the purposes of VFDI, a page is a memory region with size and
* alignment of 4K. All addresses are DMA addresses to be used within
* the domain of the relevant VF.
*
* The only hardware-defined channels for a VF driver to communicate
* with the PF driver are the event mailboxes (%FR_CZ_USR_EV
* registers). Writing to these registers generates an event with
* EV_CODE = EV_CODE_USR_EV, USER_QID set to the index of the mailbox
* and USER_EV_REG_VALUE set to the value written. The PF driver may
* direct or disable delivery of these events by setting
* %FR_CZ_USR_EV_CFG.
*
* The PF driver can send arbitrary events to arbitrary event queues.
* However, for consistency, VFDI events from the PF are defined to
* follow the same form and be sent to the first event queue assigned
* to the VF while that queue is enabled by the VF driver.
*
* The general form of the variable bits of VFDI events is:
*
* 0              16                        24      31
* | DATA         | TYPE                    | SEQ   |
*
* SEQ is a sequence number which should be incremented by 1 (modulo
* 256) for each event. The sequence numbers used in each direction
* are independent.
*
* The VF submits requests of type &struct vfdi_req by sending the
* address of the request (ADDR) in a series of 4 events:
*
* 0              16                        24      31
* | ADDR[0:15]   | VFDI_EV_TYPE_REQ_WORD0  | SEQ   |
* | ADDR[16:31]  | VFDI_EV_TYPE_REQ_WORD1  | SEQ+1 |
* | ADDR[32:47]  | VFDI_EV_TYPE_REQ_WORD2  | SEQ+2 |
* | ADDR[48:63]  | VFDI_EV_TYPE_REQ_WORD3  | SEQ+3 |
*
* The address must be page-aligned. After receiving such a valid
* series of events, the PF driver will attempt to read the request
* and write a response to the same address. In case of an invalid
* sequence of events or a DMA error, there will be no response.
*
* The VF driver may request that the PF driver writes status
* information into its domain asynchronously. After writing the
* status, the PF driver will send an event of the form:
*
* 0              16                        24      31
* | reserved     | VFDI_EV_TYPE_STATUS     | SEQ   |
*
* In case the VF must be reset for any reason, the PF driver will
* send an event of the form:
*
* 0              16                        24      31
* | reserved     | VFDI_EV_TYPE_RESET      | SEQ   |
*
* It is then the responsibility of the VF driver to request
* reinitialisation of its queues.
*/
#define VFDI_EV_SEQ_LBN 24
#define VFDI_EV_SEQ_WIDTH 8
#define VFDI_EV_TYPE_LBN 16
#define VFDI_EV_TYPE_WIDTH 8
#define VFDI_EV_TYPE_REQ_WORD0 0
#define VFDI_EV_TYPE_REQ_WORD1 1
#define VFDI_EV_TYPE_REQ_WORD2 2
#define VFDI_EV_TYPE_REQ_WORD3 3
#define VFDI_EV_TYPE_STATUS 4
#define VFDI_EV_TYPE_RESET 5
#define VFDI_EV_DATA_LBN 0
#define VFDI_EV_DATA_WIDTH 16
struct vfdi_endpoint {
u8 mac_addr[ETH_ALEN];
__be16 tci;
};
/**
* enum vfdi_op - VFDI operation enumeration
* @VFDI_OP_RESPONSE: Indicates a response to the request.
* @VFDI_OP_INIT_EVQ: Initialize SRAM entries and initialize an EVQ.
* @VFDI_OP_INIT_RXQ: Initialize SRAM entries and initialize an RXQ.
* @VFDI_OP_INIT_TXQ: Initialize SRAM entries and initialize a TXQ.
* @VFDI_OP_FINI_ALL_QUEUES: Flush all queues, finalize all queues, then
* finalize the SRAM entries.
* @VFDI_OP_INSERT_FILTER: Insert a MAC filter targeting the given RXQ.
* @VFDI_OP_REMOVE_ALL_FILTERS: Remove all filters.
* @VFDI_OP_SET_STATUS_PAGE: Set the DMA page(s) used for status updates
* from PF and write the initial status.
* @VFDI_OP_CLEAR_STATUS_PAGE: Clear the DMA page(s) used for status
* updates from PF.
*/
enum vfdi_op {
VFDI_OP_RESPONSE = 0,
VFDI_OP_INIT_EVQ = 1,
VFDI_OP_INIT_RXQ = 2,
VFDI_OP_INIT_TXQ = 3,
VFDI_OP_FINI_ALL_QUEUES = 4,
VFDI_OP_INSERT_FILTER = 5,
VFDI_OP_REMOVE_ALL_FILTERS = 6,
VFDI_OP_SET_STATUS_PAGE = 7,
VFDI_OP_CLEAR_STATUS_PAGE = 8,
VFDI_OP_LIMIT,
};
/* Response codes for VFDI operations. Other values may be used in future. */
#define VFDI_RC_SUCCESS 0
#define VFDI_RC_ENOMEM (-12)
#define VFDI_RC_EINVAL (-22)
#define VFDI_RC_EOPNOTSUPP (-95)
#define VFDI_RC_ETIMEDOUT (-110)
/**
* struct vfdi_req - Request from VF driver to PF driver
* @op: Operation code or response indicator, taken from &enum vfdi_op.
* @rc: Response code. Set to 0 on success or a negative error code on failure.
* @u.init_evq.index: Index of event queue to create.
* @u.init_evq.buf_count: Number of 4k buffers backing event queue.
* @u.init_evq.addr: Array of length %u.init_evq.buf_count containing DMA
* address of each page backing the event queue.
* @u.init_rxq.index: Index of receive queue to create.
* @u.init_rxq.buf_count: Number of 4k buffers backing receive queue.
* @u.init_rxq.evq: Instance of event queue to target receive events at.
* @u.init_rxq.label: Label used in receive events.
* @u.init_rxq.flags: Unused.
* @u.init_rxq.addr: Array of length %u.init_rxq.buf_count containing DMA
* address of each page backing the receive queue.
* @u.init_txq.index: Index of transmit queue to create.
* @u.init_txq.buf_count: Number of 4k buffers backing transmit queue.
* @u.init_txq.evq: Instance of event queue to target transmit completion
* events at.
* @u.init_txq.label: Label used in transmit completion events.
* @u.init_txq.flags: Checksum offload flags.
* @u.init_txq.addr: Array of length %u.init_txq.buf_count containing DMA
* address of each page backing the transmit queue.
* @u.mac_filter.rxq: Insert MAC filter at VF local address/VLAN targeting
* all traffic at this receive queue.
* @u.mac_filter.flags: MAC filter flags.
* @u.set_status_page.dma_addr: Base address for the &struct vfdi_status.
* This address must be such that the structure fits within a page.
* @u.set_status_page.peer_page_count: Number of additional pages the VF
* has provided into which peer addresses may be DMAd.
* @u.set_status_page.peer_page_addr: Array of DMA addresses of pages.
* If the number of peers exceeds 256, then the VF must provide
* additional pages in this array. The PF will then DMA up to
* 512 vfdi_endpoint structures into each page. These addresses
* must be page-aligned.
*/
struct vfdi_req {
u32 op;
u32 reserved1;
s32 rc;
u32 reserved2;
union {
struct {
u32 index;
u32 buf_count;
u64 addr[];
} init_evq;
struct {
u32 index;
u32 buf_count;
u32 evq;
u32 label;
u32 flags;
#define VFDI_RXQ_FLAG_SCATTER_EN 1
u32 reserved;
u64 addr[];
} init_rxq;
struct {
u32 index;
u32 buf_count;
u32 evq;
u32 label;
u32 flags;
#define VFDI_TXQ_FLAG_IP_CSUM_DIS 1
#define VFDI_TXQ_FLAG_TCPUDP_CSUM_DIS 2
u32 reserved;
u64 addr[];
} init_txq;
struct {
u32 rxq;
u32 flags;
#define VFDI_MAC_FILTER_FLAG_RSS 1
#define VFDI_MAC_FILTER_FLAG_SCATTER 2
} mac_filter;
struct {
u64 dma_addr;
u64 peer_page_count;
u64 peer_page_addr[];
} set_status_page;
} u;
};
/**
* struct vfdi_status - Status provided by PF driver to VF driver
* @generation_start: A generation count DMA'd to VF *before* the
* rest of the structure.
* @generation_end: A generation count DMA'd to VF *after* the
* rest of the structure.
* @version: Version of this structure; currently set to 1. Later
* versions must either be layout-compatible or only be sent to VFs
* that specifically request them.
* @length: Total length of this structure including embedded tables
* @vi_scale: log2 the number of VIs available on this VF. This quantity
* is used by the hardware for register decoding.
* @max_tx_channels: The maximum number of transmit queues the VF can use.
* @rss_rxq_count: The number of receive queues present in the shared RSS
* indirection table.
* @peer_count: Total number of peers in the complete peer list. If larger
* than ARRAY_SIZE(%peers), then the VF must provide sufficient
* additional pages each of which is filled with vfdi_endpoint structures.
* @local: The MAC address and outer VLAN tag of *this* VF
* @peers: Table of peer addresses. The @tci fields in these structures
* are currently unused and must be ignored. Additional peers are
* written into any additional pages provided by the VF.
* @timer_quantum_ns: Timer quantum (nominal period between timer ticks)
* for interrupt moderation timers, in nanoseconds. This member is only
* present if @length is sufficiently large.
*/
struct vfdi_status {
u32 generation_start;
u32 generation_end;
u32 version;
u32 length;
u8 vi_scale;
u8 max_tx_channels;
u8 rss_rxq_count;
u8 reserved1;
u16 peer_count;
u16 reserved2;
struct vfdi_endpoint local;
struct vfdi_endpoint peers[256];
/* Members below here extend version 1 of this structure */
u32 timer_quantum_ns;
};
#endif
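
A minimal sketch of the four-event request hand-off described in the DOC
comment above, using the VFDI_EV_* field definitions from this file (the
helper functions are illustrative only, not part of the patch; a real VF
driver would write each value to its FR_CZ_USR_EV mailbox register):

#include <stdint.h>

/* Assumes the VFDI_EV_* definitions above are in scope. */
static uint32_t vfdi_make_event(uint8_t seq, uint8_t type, uint16_t data)
{
	return ((uint32_t)seq << VFDI_EV_SEQ_LBN) |
	       ((uint32_t)type << VFDI_EV_TYPE_LBN) |
	       ((uint32_t)data << VFDI_EV_DATA_LBN);
}

/* Split the page-aligned DMA address of a struct vfdi_req into the
 * REQ_WORD0..REQ_WORD3 events, advancing SEQ by one (mod 256) each time,
 * 16 address bits per event as in the diagram above. */
static void vfdi_encode_req(uint64_t req_addr, uint8_t seq, uint32_t ev[4])
{
	unsigned int i;

	for (i = 0; i < 4; i++)
		ev[i] = vfdi_make_event(seq + i,
					VFDI_EV_TYPE_REQ_WORD0 + i,
					(uint16_t)(req_addr >> (16 * i)));
}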