From 44f507163d9e51238458ee6904b4d71fb0723723 Mon Sep 17 00:00:00 2001
From: Vijay Mohan Pandarathil <vijaymohan.pandarathil@hp.com>
Date: Mon, 11 Mar 2013 09:28:44 -0600
Subject: [PATCH 1/9] VFIO: Wrapper for getting reference to vfio_device

- Added vfio_device_get_from_dev() as wrapper to get
  reference to vfio_device from struct device.

- Added vfio_device_data() as a wrapper to get device_data from
  vfio_device.

Signed-off-by: Vijay Mohan Pandarathil <vijaymohan.pandarathil@hp.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c  | 30 +++++++++++++++++++++++++++++-
 include/linux/vfio.h |  3 +++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index fcc12f3e60a3..21eddd9e0f26 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -392,12 +392,13 @@ static void vfio_device_release(struct kref *kref)
 }
 
 /* Device reference always implies a group reference */
-static void vfio_device_put(struct vfio_device *device)
+void vfio_device_put(struct vfio_device *device)
 {
 	struct vfio_group *group = device->group;
 	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 	vfio_group_put(group);
 }
+EXPORT_SYMBOL_GPL(vfio_device_put);
 
 static void vfio_device_get(struct vfio_device *device)
 {
@@ -627,6 +628,33 @@ int vfio_add_group_dev(struct device *dev,
 }
 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 
+/**
+ * Get a reference to the vfio_device for a device that is known to
+ * be bound to a vfio driver.  The driver implicitly holds a
+ * vfio_device reference between vfio_add_group_dev and
+ * vfio_del_group_dev.  We can therefore use drvdata to increment
+ * that reference from the struct device.  This additional
+ * reference must be released by calling vfio_device_put.
+ */
+struct vfio_device *vfio_device_get_from_dev(struct device *dev)
+{
+	struct vfio_device *device = dev_get_drvdata(dev);
+
+	vfio_device_get(device);
+
+	return device;
+}
+EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
+
+/*
+ * Caller must hold a reference to the vfio_device
+ */
+void *vfio_device_data(struct vfio_device *device)
+{
+	return device->device_data;
+}
+EXPORT_SYMBOL_GPL(vfio_device_data);
+
 /* Given a referenced group, check if it contains the device */
 static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
 {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ab9e86224c54..ac8d488e4372 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev,
 			      void *device_data);
 
 extern void *vfio_del_group_dev(struct device *dev);
+extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
+extern void vfio_device_put(struct vfio_device *device);
+extern void *vfio_device_data(struct vfio_device *device);
 
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks

From dad9f8972e04cd081a028d8fb1249d746d97fc03 Mon Sep 17 00:00:00 2001
From: Vijay Mohan Pandarathil <vijaymohan.pandarathil@hp.com>
Date: Mon, 11 Mar 2013 09:31:22 -0600
Subject: [PATCH 2/9] VFIO-AER: Vfio-pci driver changes for supporting AER

- New VFIO_SET_IRQ ioctl option to pass the eventfd that is signaled when
  an error occurs in the vfio_pci_device

- Register pci_error_handler for the vfio_pci driver

- When the device encounters an error, the error handler registered by
  the vfio_pci driver gets invoked by the AER infrastructure

- In the error handler, signal the eventfd registered for the device.

- This results in the qemu eventfd handler getting invoked and
  appropriate action taken for the guest.

Signed-off-by: Vijay Mohan Pandarathil <vijaymohan.pandarathil@hp.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         | 44 +++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_intrs.c   | 64 +++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_private.h |  1 +
 include/uapi/linux/vfio.h           |  1 +
 4 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 8189cb6a86af..acfcb1ae77bc 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
 
 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
 		}
-	}
+	} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+		if (pci_is_pcie(vdev->pdev))
+			return 1;
 
 	return 0;
 }
@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data,
 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
 			return -EINVAL;
 
+		switch (info.index) {
+		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+			break;
+		case VFIO_PCI_ERR_IRQ_INDEX:
+			if (pci_is_pcie(vdev->pdev))
+				break;
+		/* pass thru to return error */
+		default:
+			return -EINVAL;
+		}
+
 		info.flags = VFIO_IRQ_INFO_EVENTFD;
 
 		info.count = vfio_pci_get_irq_count(vdev, info.index);
@@ -551,11 +564,40 @@ static void vfio_pci_remove(struct pci_dev *pdev)
 	kfree(vdev);
 }
 
+static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
+						  pci_channel_state_t state)
+{
+	struct vfio_pci_device *vdev;
+	struct vfio_device *device;
+
+	device = vfio_device_get_from_dev(&pdev->dev);
+	if (device == NULL)
+		return PCI_ERS_RESULT_DISCONNECT;
+
+	vdev = vfio_device_data(device);
+	if (vdev == NULL) {
+		vfio_device_put(device);
+		return PCI_ERS_RESULT_DISCONNECT;
+	}
+
+	if (vdev->err_trigger)
+		eventfd_signal(vdev->err_trigger, 1);
+
+	vfio_device_put(device);
+
+	return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static struct pci_error_handlers vfio_err_handlers = {
+	.error_detected = vfio_pci_aer_err_detected,
+};
+
 static struct pci_driver vfio_pci_driver = {
 	.name		= "vfio-pci",
 	.id_table	= NULL, /* only dynamic ids */
 	.probe		= vfio_pci_probe,
 	.remove		= vfio_pci_remove,
+	.err_handler	= &vfio_err_handlers,
 };
 
 static void __exit vfio_pci_cleanup(void)
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 3639371fa697..b84bf2210a91 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -745,6 +745,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
 	return 0;
 }
 
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+				    unsigned index, unsigned start,
+				    unsigned count, uint32_t flags, void *data)
+{
+	int32_t fd = *(int32_t *)data;
+	struct pci_dev *pdev = vdev->pdev;
+
+	if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
+	    !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+		return -EINVAL;
+
+	/*
+	 * device_lock synchronizes setting and checking of
+	 * err_trigger. The vfio_pci_aer_err_detected() is also
+	 * called with device_lock held.
+	 */
+
+	/* DATA_NONE/DATA_BOOL enables loopback testing */
+
+	if (flags & VFIO_IRQ_SET_DATA_NONE) {
+		device_lock(&pdev->dev);
+		if (vdev->err_trigger)
+			eventfd_signal(vdev->err_trigger, 1);
+		device_unlock(&pdev->dev);
+		return 0;
+	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+		uint8_t trigger = *(uint8_t *)data;
+		device_lock(&pdev->dev);
+		if (trigger && vdev->err_trigger)
+			eventfd_signal(vdev->err_trigger, 1);
+		device_unlock(&pdev->dev);
+		return 0;
+	}
+
+	/* Handle SET_DATA_EVENTFD */
+
+	if (fd == -1) {
+		device_lock(&pdev->dev);
+		if (vdev->err_trigger)
+			eventfd_ctx_put(vdev->err_trigger);
+		vdev->err_trigger = NULL;
+		device_unlock(&pdev->dev);
+		return 0;
+	} else if (fd >= 0) {
+		struct eventfd_ctx *efdctx;
+		efdctx = eventfd_ctx_fdget(fd);
+		if (IS_ERR(efdctx))
+			return PTR_ERR(efdctx);
+		device_lock(&pdev->dev);
+		if (vdev->err_trigger)
+			eventfd_ctx_put(vdev->err_trigger);
+		vdev->err_trigger = efdctx;
+		device_unlock(&pdev->dev);
+		return 0;
+	} else
+		return -EINVAL;
+}
 int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			    unsigned index, unsigned start, unsigned count,
 			    void *data)
@@ -779,6 +836,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
 			break;
 		}
 		break;
+	case VFIO_PCI_ERR_IRQ_INDEX:
+		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+		case VFIO_IRQ_SET_ACTION_TRIGGER:
+			if (pci_is_pcie(vdev->pdev))
+				func = vfio_pci_set_err_trigger;
+			break;
+		}
 	}
 
 	if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index d7e55d03f49e..9c6d5d0f3b02 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -56,6 +56,7 @@ struct vfio_pci_device {
 	bool			has_vga;
 	struct pci_saved_state	*pci_saved_state;
 	atomic_t		refcnt;
+	struct eventfd_ctx	*err_trigger;
 };
 
 #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4f41f309911e..284ff2436829 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -319,6 +319,7 @@ enum {
 	VFIO_PCI_INTX_IRQ_INDEX,
 	VFIO_PCI_MSI_IRQ_INDEX,
 	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_ERR_IRQ_INDEX,
 	VFIO_PCI_NUM_IRQS
 };
 

From 0bced2f7280cd12b66229ba55d05c62a4eef6cfd Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Mon, 25 Mar 2013 11:15:44 -0600
Subject: [PATCH 3/9] vfio: make local function vfio_pci_intx_unmask_handler()
 static

vfio_pci_intx_unmask_handler() was not declared. It should be static.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_intrs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index b84bf2210a91..c4429418124e 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -286,7 +286,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
  * a signal is necessary, which can then be handled via a work queue
  * or directly depending on the caller.
  */
-int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
+static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
+					void *unused)
 {
 	struct pci_dev *pdev = vdev->pdev;
 	unsigned long flags;

From 180b1381078924b2442a42cded514afd6faff458 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 1 Apr 2013 09:03:44 -0600
Subject: [PATCH 4/9] vfio-pci: Use byte granularity in config map

The config map previously used a byte per dword to map regions of
config space to capabilities.  Modulo a bug where we round the length
of capabilities down instead of up, this theoretically works well and
saves space so long as devices don't try to hide registers in the gaps
between capabilities.  Unfortunately they do exactly that so we need
byte granularity on our config space map.  Increase the allocation of
the config map and split accesses at capability region boundaries.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 drivers/vfio/pci/vfio_pci_config.c | 88 ++++++++++++++++--------------
 1 file changed, 47 insertions(+), 41 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 964ff22bf281..5d5fc75743ce 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -800,9 +800,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
 	u8 cap;
 	int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
 						 PCI_STD_HEADER_SIZEOF;
-	base /= 4;
-	pos /= 4;
-
 	cap = vdev->pci_config_map[pos];
 
 	if (cap == PCI_CAP_ID_BASIC)
@@ -812,7 +809,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
 	while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
 		pos--;
 
-	return pos * 4;
+	return pos;
 }
 
 static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
@@ -1229,8 +1226,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
 		}
 
 		/* Sanity check, do we overlap other capabilities? */
-		for (i = 0; i < len; i += 4) {
-			if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID))
+		for (i = 0; i < len; i++) {
+			if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
 				continue;
 
 			pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
@@ -1238,7 +1235,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
 				pos + i, map[pos + i], cap);
 		}
 
-		memset(map + (pos / 4), cap, len / 4);
+		memset(map + pos, cap, len);
 		ret = vfio_fill_vconfig_bytes(vdev, pos, len);
 		if (ret)
 			return ret;
@@ -1313,8 +1310,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
 			hidden = true;
 		}
 
-		for (i = 0; i < len; i += 4) {
-			if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID))
+		for (i = 0; i < len; i++) {
+			if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
 				continue;
 
 			pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
@@ -1329,7 +1326,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
 		 */
 		BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
 
-		memset(map + (epos / 4), ecap, len / 4);
+		memset(map + epos, ecap, len);
 		ret = vfio_fill_vconfig_bytes(vdev, epos, len);
 		if (ret)
 			return ret;
@@ -1376,10 +1373,12 @@ int vfio_config_init(struct vfio_pci_device *vdev)
 	int ret;
 
 	/*
-	 * Config space, caps and ecaps are all dword aligned, so we can
-	 * use one byte per dword to record the type.
+	 * Config space, caps and ecaps are all dword aligned, so we could
+	 * use one byte per dword to record the type.  However, there are
+	 * no requiremenst on the length of a capability, so the gap between
+	 * capabilities needs byte granularity.
 	 */
-	map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL);
+	map = kmalloc(pdev->cfg_size, GFP_KERNEL);
 	if (!map)
 		return -ENOMEM;
 
@@ -1392,9 +1391,9 @@ int vfio_config_init(struct vfio_pci_device *vdev)
 	vdev->pci_config_map = map;
 	vdev->vconfig = vconfig;
 
-	memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4);
-	memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID,
-	       (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4);
+	memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
+	memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
+	       pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
 
 	ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
 	if (ret)
@@ -1449,6 +1448,22 @@ void vfio_config_free(struct vfio_pci_device *vdev)
 	vdev->msi_perm = NULL;
 }
 
+/*
+ * Find the remaining number of bytes in a dword that match the given
+ * position.  Stop at either the end of the capability or the dword boundary.
+ */
+static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
+					   loff_t pos)
+{
+	u8 cap = vdev->pci_config_map[pos];
+	size_t i;
+
+	for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++)
+		/* nop */;
+
+	return i;
+}
+
 static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
 				 size_t count, loff_t *ppos, bool iswrite)
 {
@@ -1457,19 +1472,27 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
 	__le32 val = 0;
 	int cap_start = 0, offset;
 	u8 cap_id;
-	ssize_t ret = count;
+	ssize_t ret;
 
-	if (*ppos < 0 || *ppos + count > pdev->cfg_size)
+	if (*ppos < 0 || *ppos >= pdev->cfg_size ||
+	    *ppos + count > pdev->cfg_size)
 		return -EFAULT;
 
 	/*
-	 * gcc can't seem to figure out we're a static function, only called
-	 * with count of 1/2/4 and hits copy_from_user_overflow without this.
+	 * Chop accesses into aligned chunks containing no more than a
+	 * single capability.  Caller increments to the next chunk.
 	 */
-	if (count > sizeof(val))
-		return -EINVAL;
+	count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
+	if (count >= 4 && !(*ppos % 4))
+		count = 4;
+	else if (count >= 2 && !(*ppos % 2))
+		count = 2;
+	else
+		count = 1;
 
-	cap_id = vdev->pci_config_map[*ppos / 4];
+	ret = count;
+
+	cap_id = vdev->pci_config_map[*ppos];
 
 	if (cap_id == PCI_CAP_ID_INVALID) {
 		if (iswrite)
@@ -1485,11 +1508,6 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
 		return ret;
 	}
 
-	/*
-	 * All capabilities are minimum 4 bytes and aligned on dword
-	 * boundaries.  Since we don't support unaligned accesses, we're
-	 * only ever accessing a single capability.
-	 */
 	if (*ppos >= PCI_CFG_SPACE_SIZE) {
 		WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
 
@@ -1545,20 +1563,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
 
 	pos &= VFIO_PCI_OFFSET_MASK;
 
-	/*
-	 * We want to both keep the access size the caller users as well as
-	 * support reading large chunks of config space in a single call.
-	 * PCI doesn't support unaligned accesses, so we can safely break
-	 * those apart.
-	 */
 	while (count) {
-		if (count >= 4 && !(pos % 4))
-			ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
-		else if (count >= 2 && !(pos % 2))
-			ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
-		else
-			ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
-
+		ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
 		if (ret < 0)
 			return ret;
 

From a7d1ea1c11b33bda2691f3294b4d735ed635535a Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 1 Apr 2013 09:04:12 -0600
Subject: [PATCH 5/9] vfio-pci: Enable raw access to unassigned config space

Devices like be2net hide registers between the gaps in capabilities
and architected regions of PCI config space.  Our choices to support
such devices is to either build an ever growing and unmanageable white
list or rely on hardware isolation to protect us.  These registers are
really no different than MMIO or I/O port space registers, which we
don't attempt to regulate, so treat PCI config space in the same way.

Reported-by: Gavin Shan <shangw@linux.vnet.ibm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 drivers/vfio/pci/vfio_pci_config.c | 80 ++++++++++++++++++------------
 1 file changed, 47 insertions(+), 33 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 5d5fc75743ce..072fdacb6f96 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -273,9 +273,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
 	return count;
 }
 
-static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
-				    int count, struct perm_bits *perm,
-				    int offset, __le32 val)
+/* Raw access skips any kind of virtualization */
+static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos,
+				 int count, struct perm_bits *perm,
+				 int offset, __le32 val)
 {
 	int ret;
 
@@ -286,13 +287,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
 	return count;
 }
 
-/* Default all regions to read-only, no-virtualization */
+static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
+				int count, struct perm_bits *perm,
+				int offset, __le32 *val)
+{
+	int ret;
+
+	ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+	if (ret)
+		return pcibios_err_to_errno(ret);
+
+	return count;
+}
+
+/* Default capability regions to read-only, no-virtualization */
 static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
 	[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
 };
 static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
 	[0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
 };
+/*
+ * Default unassigned regions to raw read-write access.  Some devices
+ * require this to function as they hide registers between the gaps in
+ * config space (be2net).  Like MMIO and I/O port registers, we have
+ * to trust the hardware isolation.
+ */
+static struct perm_bits unassigned_perms = {
+	.readfn = vfio_raw_config_read,
+	.writefn = vfio_raw_config_write
+};
 
 static void free_perm_bits(struct perm_bits *perm)
 {
@@ -778,16 +802,16 @@ int __init vfio_pci_init_perm_bits(void)
 
 	/* Capabilities */
 	ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
-	cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
+	cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write;
 	ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
-	cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+	cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
 	ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
 	ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
 
 	/* Extended capabilities */
 	ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
 	ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
-	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+	ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
 
 	if (ret)
 		vfio_pci_uninit_perm_bits();
@@ -1495,35 +1519,25 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
 	cap_id = vdev->pci_config_map[*ppos];
 
 	if (cap_id == PCI_CAP_ID_INVALID) {
-		if (iswrite)
-			return ret; /* drop */
-
-		/*
-		 * Per PCI spec 3.0, section 6.1, reads from reserved and
-		 * unimplemented registers return 0
-		 */
-		if (copy_to_user(buf, &val, count))
-			return -EFAULT;
-
-		return ret;
-	}
-
-	if (*ppos >= PCI_CFG_SPACE_SIZE) {
-		WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
-
-		perm = &ecap_perms[cap_id];
-		cap_start = vfio_find_cap_start(vdev, *ppos);
-
+		perm = &unassigned_perms;
+		cap_start = *ppos;
 	} else {
-		WARN_ON(cap_id > PCI_CAP_ID_MAX);
+		if (*ppos >= PCI_CFG_SPACE_SIZE) {
+			WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
 
-		perm = &cap_perms[cap_id];
-
-		if (cap_id == PCI_CAP_ID_MSI)
-			perm = vdev->msi_perm;
-
-		if (cap_id > PCI_CAP_ID_BASIC)
+			perm = &ecap_perms[cap_id];
 			cap_start = vfio_find_cap_start(vdev, *ppos);
+		} else {
+			WARN_ON(cap_id > PCI_CAP_ID_MAX);
+
+			perm = &cap_perms[cap_id];
+
+			if (cap_id == PCI_CAP_ID_MSI)
+				perm = vdev->msi_perm;
+
+			if (cap_id > PCI_CAP_ID_BASIC)
+				cap_start = vfio_find_cap_start(vdev, *ppos);
+		}
 	}
 
 	WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);

From aa2cba51a0a85dfbd5be58239e59bc5e8b5fb7cf Mon Sep 17 00:00:00 2001
From: Yijing Wang <wangyijing@huawei.com>
Date: Mon, 15 Apr 2013 08:45:10 -0600
Subject: [PATCH 6/9] PCI/VFIO: use pcie_flags_reg instead of access PCI-E
 Capabilities Register

Currently, we use pcie_flags_reg to cache PCI-E Capabilities Register,
because PCI-E Capabilities Register bits are almost read-only. This patch
use pcie_caps_reg() instead of another access PCI-E Capabilities Register.

Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_config.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 072fdacb6f96..0e83e8ead142 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -1037,13 +1037,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
 		return byte;
 	case PCI_CAP_ID_EXP:
 		/* length based on version */
-		ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
-		if (ret)
-			return pcibios_err_to_errno(ret);
-
 		vdev->extended_caps = true;
 
-		if ((word & PCI_EXP_FLAGS_VERS) == 1)
+		if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
 			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
 		else
 			return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;

From 9587f44aa69a4cfb13701b31a633852684bed701 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 25 Apr 2013 16:12:38 -0600
Subject: [PATCH 7/9] vfio: Convert container->group_lock to rwsem

All current users are writers, maintaining current mutual exclusion.
This lets us add read users next.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 21eddd9e0f26..073788e50e4b 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -24,6 +24,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -57,7 +58,7 @@ struct vfio_iommu_driver {
 struct vfio_container {
 	struct kref			kref;
 	struct list_head		group_list;
-	struct mutex			group_lock;
+	struct rw_semaphore		group_lock;
 	struct vfio_iommu_driver	*iommu_driver;
 	void				*iommu_data;
 };
@@ -738,7 +739,7 @@ static long vfio_ioctl_check_extension(struct vfio_container *container,
 	return ret;
 }
 
-/* hold container->group_lock */
+/* hold write lock on container->group_lock */
 static int __vfio_container_attach_groups(struct vfio_container *container,
 					  struct vfio_iommu_driver *driver,
 					  void *data)
@@ -769,7 +770,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
 	struct vfio_iommu_driver *driver;
 	long ret = -ENODEV;
 
-	mutex_lock(&container->group_lock);
+	down_write(&container->group_lock);
 
 	/*
 	 * The container is designed to be an unprivileged interface while
@@ -780,7 +781,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
 	 * the container is deprivileged and returns to an unset state.
 	 */
 	if (list_empty(&container->group_list) || container->iommu_driver) {
-		mutex_unlock(&container->group_lock);
+		up_write(&container->group_lock);
 		return -EINVAL;
 	}
 
@@ -827,7 +828,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
 
 	mutex_unlock(&vfio.iommu_drivers_lock);
 skip_drivers_unlock:
-	mutex_unlock(&container->group_lock);
+	up_write(&container->group_lock);
 
 	return ret;
 }
@@ -882,7 +883,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&container->group_list);
-	mutex_init(&container->group_lock);
+	init_rwsem(&container->group_lock);
 	kref_init(&container->kref);
 
 	filep->private_data = container;
@@ -961,7 +962,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
 	struct vfio_container *container = group->container;
 	struct vfio_iommu_driver *driver;
 
-	mutex_lock(&container->group_lock);
+	down_write(&container->group_lock);
 
 	driver = container->iommu_driver;
 	if (driver)
@@ -979,7 +980,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
 		container->iommu_data = NULL;
 	}
 
-	mutex_unlock(&container->group_lock);
+	up_write(&container->group_lock);
 
 	vfio_container_put(container);
 }
@@ -1039,7 +1040,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	container = f.file->private_data;
 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
 
-	mutex_lock(&container->group_lock);
+	down_write(&container->group_lock);
 
 	driver = container->iommu_driver;
 	if (driver) {
@@ -1057,7 +1058,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 	atomic_inc(&group->container_users);
 
 unlock_out:
-	mutex_unlock(&container->group_lock);
+	up_write(&container->group_lock);
 	fdput(f);
 	return ret;
 }

From 0b43c082338b857c27d2f87c886eb78812ccd236 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 29 Apr 2013 08:41:36 -0600
Subject: [PATCH 8/9] vfio: Use down_reads to protect iommu disconnects

If a group or device is released or a container is unset from a group
it can race against file ops on the container.  Protect these with
down_reads to allow concurrent users.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Reported-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vfio/vfio.c | 62 +++++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 16 deletions(-)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 073788e50e4b..ac7423bfaa7d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -704,9 +704,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev);
 static long vfio_ioctl_check_extension(struct vfio_container *container,
 				       unsigned long arg)
 {
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
 	long ret = 0;
 
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+
 	switch (arg) {
 		/* No base extensions yet */
 	default:
@@ -736,6 +740,8 @@ static long vfio_ioctl_check_extension(struct vfio_container *container,
 						 VFIO_CHECK_EXTENSION, arg);
 	}
 
+	up_read(&container->group_lock);
+
 	return ret;
 }
 
@@ -844,9 +850,6 @@ static long vfio_fops_unl_ioctl(struct file *filep,
 	if (!container)
 		return ret;
 
-	driver = container->iommu_driver;
-	data = container->iommu_data;
-
 	switch (cmd) {
 	case VFIO_GET_API_VERSION:
 		ret = VFIO_API_VERSION;
@@ -858,8 +861,15 @@ static long vfio_fops_unl_ioctl(struct file *filep,
 		ret = vfio_ioctl_set_iommu(container, arg);
 		break;
 	default:
+		down_read(&container->group_lock);
+
+		driver = container->iommu_driver;
+		data = container->iommu_data;
+
 		if (driver) /* passthrough all unrecognized ioctls */
 			ret = driver->ops->ioctl(data, cmd, arg);
+
+		up_read(&container->group_lock);
 	}
 
 	return ret;
@@ -910,35 +920,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
 			      size_t count, loff_t *ppos)
 {
 	struct vfio_container *container = filep->private_data;
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
 
-	if (unlikely(!driver || !driver->ops->read))
-		return -EINVAL;
+	down_read(&container->group_lock);
 
-	return driver->ops->read(container->iommu_data, buf, count, ppos);
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->read))
+		ret = driver->ops->read(container->iommu_data,
+					buf, count, ppos);
+
+	up_read(&container->group_lock);
+
+	return ret;
 }
 
 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
 			       size_t count, loff_t *ppos)
 {
 	struct vfio_container *container = filep->private_data;
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret = -EINVAL;
 
-	if (unlikely(!driver || !driver->ops->write))
-		return -EINVAL;
+	down_read(&container->group_lock);
 
-	return driver->ops->write(container->iommu_data, buf, count, ppos);
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->write))
+		ret = driver->ops->write(container->iommu_data,
+					 buf, count, ppos);
+
+	up_read(&container->group_lock);
+
+	return ret;
 }
 
 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
 {
 	struct vfio_container *container = filep->private_data;
-	struct vfio_iommu_driver *driver = container->iommu_driver;
+	struct vfio_iommu_driver *driver;
+	int ret = -EINVAL;
 
-	if (unlikely(!driver || !driver->ops->mmap))
-		return -EINVAL;
+	down_read(&container->group_lock);
 
-	return driver->ops->mmap(container->iommu_data, vma);
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->mmap))
+		ret = driver->ops->mmap(container->iommu_data, vma);
+
+	up_read(&container->group_lock);
+
+	return ret;
 }
 
 static const struct file_operations vfio_fops = {

From 664e9386bd05dbdfecfb28d6cf2fde983aabc65c Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Tue, 30 Apr 2013 15:42:28 -0600
Subject: [PATCH 9/9] vfio: Set container device mode

Minor 0 is the VFIO container device (/dev/vfio/vfio).  On it's own
the container does not provide a user with any privileged access.  It
only supports API version check and extension check ioctls.  Only by
attaching a VFIO group to the container does it gain any access.  Set
the mode of the container to allow access.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index ac7423bfaa7d..acb7121a9316 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -27,6 +27,7 @@
 #include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
@@ -1359,6 +1360,9 @@ static const struct file_operations vfio_device_fops = {
  */
 static char *vfio_devnode(struct device *dev, umode_t *mode)
 {
+	if (MINOR(dev->devt) == 0)
+		*mode = S_IRUGO | S_IWUGO;
+
 	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
 }