From a941b784b15ff65e1a3b6a259c6d6cf7fa0bb3c3 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Fri, 15 Sep 2023 10:24:21 +0530 Subject: [PATCH 01/18] cdx: add support for bus mastering Introduce the cdx_set_master() and cdx_clear_master() APIs to support enabling and disabling bus mastering. Drivers need to use these APIs to enable/disable DMA from CDX devices. Signed-off-by: Nipun Gupta Reviewed-by: Pieter Jansen van Vuuren Link: https://lore.kernel.org/r/20230915045423.31630-1-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- drivers/cdx/cdx.c | 32 ++++++++++++++ drivers/cdx/controller/cdx_controller.c | 4 ++ drivers/cdx/controller/mcdi_functions.c | 58 +++++++++++++++++++++++++ drivers/cdx/controller/mcdi_functions.h | 13 ++++++ include/linux/cdx/cdx_bus.h | 18 ++++++++ 5 files changed, 125 insertions(+) diff --git a/drivers/cdx/cdx.c b/drivers/cdx/cdx.c index d2cad4c670a0..9efb7584f952 100644 --- a/drivers/cdx/cdx.c +++ b/drivers/cdx/cdx.c @@ -182,6 +182,38 @@ cdx_match_id(const struct cdx_device_id *ids, struct cdx_device *dev) return NULL; } +int cdx_set_master(struct cdx_device *cdx_dev) +{ + struct cdx_controller *cdx = cdx_dev->cdx; + struct cdx_device_config dev_config; + int ret = -EOPNOTSUPP; + + dev_config.type = CDX_DEV_BUS_MASTER_CONF; + dev_config.bus_master_enable = true; + if (cdx->ops->dev_configure) + ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num, + cdx_dev->dev_num, &dev_config); + + return ret; +} +EXPORT_SYMBOL_GPL(cdx_set_master); + +int cdx_clear_master(struct cdx_device *cdx_dev) +{ + struct cdx_controller *cdx = cdx_dev->cdx; + struct cdx_device_config dev_config; + int ret = -EOPNOTSUPP; + + dev_config.type = CDX_DEV_BUS_MASTER_CONF; + dev_config.bus_master_enable = false; + if (cdx->ops->dev_configure) + ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num, + cdx_dev->dev_num, &dev_config); + + return ret; +} +EXPORT_SYMBOL_GPL(cdx_clear_master); + /** * cdx_bus_match - device to driver matching callback * @dev: the cdx device to match against diff --git a/drivers/cdx/controller/cdx_controller.c b/drivers/cdx/controller/cdx_controller.c index bb4ae7970e21..7828dac8edb1 100644 --- a/drivers/cdx/controller/cdx_controller.c +++ b/drivers/cdx/controller/cdx_controller.c @@ -56,6 +56,10 @@ static int cdx_configure_device(struct cdx_controller *cdx, case CDX_DEV_RESET_CONF: ret = cdx_mcdi_reset_device(cdx->priv, bus_num, dev_num); break; + case CDX_DEV_BUS_MASTER_CONF: + ret = cdx_mcdi_bus_master_enable(cdx->priv, bus_num, dev_num, + dev_config->bus_master_enable); + break; default: ret = -EINVAL; } diff --git a/drivers/cdx/controller/mcdi_functions.c b/drivers/cdx/controller/mcdi_functions.c index 0158f26533dd..fc82435d5dea 100644 --- a/drivers/cdx/controller/mcdi_functions.c +++ b/drivers/cdx/controller/mcdi_functions.c @@ -137,3 +137,61 @@ int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num) return ret; } + +static int cdx_mcdi_ctrl_flag_get(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, u32 *flags) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_IN_LEN); + MCDI_DECLARE_BUF(outbuf, MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN); + size_t outlen; + int ret; + + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_BUS, bus_num); + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_GET_IN_DEVICE, dev_num); + ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_GET, inbuf, + sizeof(inbuf), outbuf, sizeof(outbuf), &outlen); + if (ret) + return ret; + + if (outlen != MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN) + return -EIO; + + *flags = MCDI_DWORD(outbuf, 
CDX_DEVICE_CONTROL_GET_OUT_FLAGS); + + return 0; +} + +static int cdx_mcdi_ctrl_flag_set(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, bool enable, int bit_pos) +{ + MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_SET_IN_LEN); + u32 flags; + int ret; + + /* + * Get flags and then set/reset bit at bit_pos according to + * the input params. + */ + ret = cdx_mcdi_ctrl_flag_get(cdx, bus_num, dev_num, &flags); + if (ret) + return ret; + + flags = flags & (u32)(~(BIT(bit_pos))); + if (enable) + flags |= (1 << bit_pos); + + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_BUS, bus_num); + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_DEVICE, dev_num); + MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_FLAGS, flags); + ret = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_SET, inbuf, + sizeof(inbuf), NULL, 0, NULL); + + return ret; +} + +int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, bool enable) +{ + return cdx_mcdi_ctrl_flag_set(cdx, bus_num, dev_num, enable, + MC_CMD_CDX_DEVICE_CONTROL_SET_IN_BUS_MASTER_ENABLE_LBN); +} diff --git a/drivers/cdx/controller/mcdi_functions.h b/drivers/cdx/controller/mcdi_functions.h index 7440ace5539a..a448d6581eb4 100644 --- a/drivers/cdx/controller/mcdi_functions.h +++ b/drivers/cdx/controller/mcdi_functions.h @@ -58,4 +58,17 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx, int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num); +/** + * cdx_mcdi_bus_master_enable - Set/Reset bus mastering for cdx device + * represented by bus_num:dev_num + * @cdx: pointer to MCDI interface. + * @bus_num: Bus number. + * @dev_num: Device number. + * @enable: Enable bus mastering if set, disable otherwise. + * + * Return: 0 on success, <0 on failure + */ +int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num, + u8 dev_num, bool enable); + #endif /* CDX_MCDI_FUNCTIONS_H */ diff --git a/include/linux/cdx/cdx_bus.h b/include/linux/cdx/cdx_bus.h index bead71b7bc73..8320ec3b9e37 100644 --- a/include/linux/cdx/cdx_bus.h +++ b/include/linux/cdx/cdx_bus.h @@ -21,11 +21,13 @@ struct cdx_controller; enum { + CDX_DEV_BUS_MASTER_CONF, CDX_DEV_RESET_CONF, }; struct cdx_device_config { u8 type; + bool bus_master_enable; }; typedef int (*cdx_scan_cb)(struct cdx_controller *cdx); @@ -170,4 +172,20 @@ extern struct bus_type cdx_bus_type; */ int cdx_dev_reset(struct device *dev); +/** + * cdx_set_master - enables bus-mastering for CDX device + * @cdx_dev: the CDX device to enable + * + * Return: 0 for success, -errno on failure + */ +int cdx_set_master(struct cdx_device *cdx_dev); + +/** + * cdx_clear_master - disables bus-mastering for CDX device + * @cdx_dev: the CDX device to disable + * + * Return: 0 for success, -errno on failure + */ +int cdx_clear_master(struct cdx_device *cdx_dev); + #endif /* _CDX_BUS_H_ */ From f59a7b6af012619199c55f587a6c4ed681639b32 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Fri, 15 Sep 2023 10:24:22 +0530 Subject: [PATCH 02/18] vfio: add bus master feature to device feature ioctl Add bus mastering control to the VFIO_DEVICE_FEATURE IOCTL. The VFIO user can use this feature to enable or disable bus mastering for a device bound to VFIO. 
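For reference, a minimal userspace sketch of how this feature could be exercised (illustrative only, not part of the patch; device_fd is assumed to be an already-open VFIO device file descriptor):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* op is VFIO_DEVICE_FEATURE_SET_MASTER or
	 * VFIO_DEVICE_FEATURE_CLEAR_MASTER.
	 */
	static int vfio_bus_master(int device_fd, __u32 op)
	{
		char buf[sizeof(struct vfio_device_feature) +
			 sizeof(struct vfio_device_feature_bus_master)];
		struct vfio_device_feature *feature =
			(struct vfio_device_feature *)buf;
		struct vfio_device_feature_bus_master *bm =
			(struct vfio_device_feature_bus_master *)feature->data;

		memset(buf, 0, sizeof(buf));
		feature->argsz = sizeof(buf);
		feature->flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_BUS_MASTER;
		bm->op = op;

		return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
	}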
Co-developed-by: Shubham Rohila Signed-off-by: Shubham Rohila Signed-off-by: Nipun Gupta Link: https://lore.kernel.org/r/20230915045423.31630-2-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index afc1369216d9..b177424149f4 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1434,6 +1434,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in op flag. + * + * The functionality is incorporated for devices that need bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** From 8a97ab9b8b31790d0014d3183f33a167454171c0 Mon Sep 17 00:00:00 2001 From: Nipun Gupta Date: Fri, 15 Sep 2023 10:24:23 +0530 Subject: [PATCH 03/18] vfio-cdx: add bus mastering device feature support Support bus master enable and disable on VFIO-CDX devices using the VFIO_DEVICE_FEATURE_BUS_MASTER flag over the VFIO_DEVICE_FEATURE IOCTL. 
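For illustration, a hypothetical in-kernel consumer of the underlying CDX bus API introduced earlier in this series (my_cdx_probe is a made-up driver callback, not part of this patch):

	/* Enable bus mastering before the device is expected to DMA. */
	static int my_cdx_probe(struct cdx_device *cdx_dev)
	{
		int ret;

		ret = cdx_set_master(cdx_dev);
		if (ret)
			return ret;

		/* ... further device setup ... */
		return 0;
	}

vfio-cdx routes the userspace SET/CLEAR requests to the same cdx_set_master()/cdx_clear_master() helpers, as the diff below shows.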
Co-developed-by: Shubham Rohila Signed-off-by: Shubham Rohila Signed-off-by: Nipun Gupta Link: https://lore.kernel.org/r/20230915045423.31630-3-nipun.gupta@amd.com Signed-off-by: Alex Williamson --- drivers/vfio/cdx/main.c | 57 +++++++++++++++++++++++++++++++++++++- drivers/vfio/cdx/private.h | 2 ++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index de56686581ae..a437630be354 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -14,7 +14,7 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev) container_of(core_vdev, struct vfio_cdx_device, vdev); struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); int count = cdx_dev->res_count; - int i; + int i, ret; vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region), GFP_KERNEL_ACCOUNT); @@ -39,6 +39,17 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev) if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY)) vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE; } + ret = cdx_dev_reset(core_vdev->dev); + if (ret) { + kfree(vdev->regions); + vdev->regions = NULL; + return ret; + } + ret = cdx_clear_master(cdx_dev); + if (ret) + vdev->flags &= ~BME_SUPPORT; + else + vdev->flags |= BME_SUPPORT; return 0; } @@ -52,6 +63,49 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev) cdx_dev_reset(core_vdev->dev); } +static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags, + void __user *arg, size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_bus_master, op); + struct vfio_cdx_device *vdev = + container_of(core_vdev, struct vfio_cdx_device, vdev); + struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev); + struct vfio_device_feature_bus_master ops; + int ret; + + if (!vdev->flags & BME_SUPPORT) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, + sizeof(ops)); + if (ret != 1) + return ret; + + if (copy_from_user(&ops, arg, minsz)) + return -EFAULT; + + switch (ops.op) { + case VFIO_DEVICE_FEATURE_CLEAR_MASTER: + return cdx_clear_master(cdx_dev); + case VFIO_DEVICE_FEATURE_SET_MASTER: + return cdx_set_master(cdx_dev); + default: + return -EINVAL; + } +} + +static int vfio_cdx_ioctl_feature(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + switch (flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_BUS_MASTER: + return vfio_cdx_bm_ctrl(device, flags, arg, argsz); + default: + return -ENOTTY; + } +} + static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev, struct vfio_device_info __user *arg) { @@ -169,6 +223,7 @@ static const struct vfio_device_ops vfio_cdx_ops = { .open_device = vfio_cdx_open_device, .close_device = vfio_cdx_close_device, .ioctl = vfio_cdx_ioctl, + .device_feature = vfio_cdx_ioctl_feature, .mmap = vfio_cdx_mmap, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, diff --git a/drivers/vfio/cdx/private.h b/drivers/vfio/cdx/private.h index 8bdc117ea88e..8e9d25913728 100644 --- a/drivers/vfio/cdx/private.h +++ b/drivers/vfio/cdx/private.h @@ -23,6 +23,8 @@ struct vfio_cdx_region { struct vfio_cdx_device { struct vfio_device vdev; struct vfio_cdx_region *regions; + u32 flags; +#define BME_SUPPORT BIT(0) }; #endif /* VFIO_CDX_PRIVATE_H */ From 2f8d25fa8aed030d7d049f0aef1b78713f431a79 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 18 Sep 2023 16:56:15 -0400 Subject: [PATCH 04/18] vfio: trivially use __aligned_u64 for ioctl structs MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit u64 alignment behaves differently depending on the architecture and so <linux/types.h> offers __aligned_u64 to achieve consistent behavior in kernel<->userspace ABIs. There are structs in <linux/vfio.h> that can trivially be updated to __aligned_u64 because the struct sizes are multiples of 8 bytes. There is no change in memory layout on any CPU architecture and therefore this change is safe. The commits that follow this one handle the trickier cases where explanation about ABI breakage is necessary. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Philippe Mathieu-Daudé Reviewed-by: Kevin Tian Signed-off-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20230918205617.1478722-2-stefanha@redhat.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index b177424149f4..325df3246691 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -277,8 +277,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ - __u64 size; /* Region size (bytes) */ - __u64 offset; /* Region offset from start of device fd */ + __aligned_u64 size; /* Region size (bytes) */ + __aligned_u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) @@ -294,8 +294,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 struct vfio_region_sparse_mmap_area { - __u64 offset; /* Offset of mmap'able area within region */ - __u64 size; /* Size of mmap'able area */ + __aligned_u64 offset; /* Offset of mmap'able area within region */ + __aligned_u64 size; /* Size of mmap'able area */ }; struct vfio_region_info_cap_sparse_mmap { @@ -450,9 +450,9 @@ struct vfio_device_migration_info { VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; - __u64 pending_bytes; - __u64 data_offset; - __u64 data_size; + __aligned_u64 pending_bytes; + __aligned_u64 data_offset; + __aligned_u64 data_size; }; /* @@ -476,7 +476,7 @@ struct vfio_device_migration_info { struct vfio_region_info_cap_nvlink2_ssatgt { struct vfio_info_cap_header header; - __u64 tgt; + __aligned_u64 tgt; }; /* @@ -1470,7 +1470,7 @@ struct vfio_iommu_type1_info { __u32 flags; #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ - __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ __u32 pad; }; From a7bea9f4fecce0afd37ee58a552eef71d8b4ab9f Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 18 Sep 2023 16:56:16 -0400 Subject: [PATCH 05/18] vfio: use __aligned_u64 in struct vfio_device_gfx_plane_info The memory layout of struct vfio_device_gfx_plane_info is architecture-dependent due to a u64 field and a struct size that is not a multiple of 8 bytes: - On x86_64 the struct size is padded to a multiple of 8 bytes. - On x32 the struct size is only a multiple of 4 bytes, not 8. - Other architectures may vary. Use __aligned_u64 to make memory layout consistent. This reduces the chance of breakage for 32-bit userspace on a 64-bit kernel. 
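As a sketch of the underlying alignment difference (offsets below assume common ABIs, e.g. i386 vs. x86_64, and are not taken from this patch):

	struct example {
		__u32 a;
		__u64 b;	/* offset 4 on i386, offset 8 on x86_64 */
	};			/* sizeof: 12 on i386, 16 on x86_64 */

	struct example_fixed {
		__u32 a;
		__aligned_u64 b;	/* offset 8 on both */
	};				/* sizeof: 16 on both */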
This patch increases the struct size on x32 but this is safe because of the struct's argsz field. The kernel may grow the struct as long as it still supports smaller argsz values from userspace (e.g. applications compiled against older kernel headers). Suggested-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20230918205617.1478722-3-stefanha@redhat.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +- include/uapi/linux/vfio.h | 3 ++- samples/vfio-mdev/mbochs.c | 2 +- samples/vfio-mdev/mdpy.c | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 42ce20e72db7..faf21be724c3 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1379,7 +1379,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd, intel_gvt_reset_vgpu(vgpu); return 0; } else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) { - struct vfio_device_gfx_plane_info dmabuf; + struct vfio_device_gfx_plane_info dmabuf = {}; int ret = 0; minsz = offsetofend(struct vfio_device_gfx_plane_info, diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 325df3246691..8a3f83e0bdc1 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -816,7 +816,7 @@ struct vfio_device_gfx_plane_info { __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ /* out */ __u32 drm_format; /* drm format of plane */ - __u64 drm_format_mod; /* tiled mode */ + __aligned_u64 drm_format_mod; /* tiled mode */ __u32 width; /* width of plane */ __u32 height; /* height of plane */ __u32 stride; /* stride of plane */ @@ -829,6 +829,7 @@ struct vfio_device_gfx_plane_info { __u32 region_index; /* region index */ __u32 dmabuf_id; /* dma-buf id */ }; + __u32 reserved; }; #define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 3764d1911b51..93405264ff23 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1262,7 +1262,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd, case VFIO_DEVICE_QUERY_GFX_PLANE: { - struct vfio_device_gfx_plane_info plane; + struct vfio_device_gfx_plane_info plane = {}; minsz = offsetofend(struct vfio_device_gfx_plane_info, region_index); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 064e1c0a7aa8..72ea5832c927 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -591,7 +591,7 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd, case VFIO_DEVICE_QUERY_GFX_PLANE: { - struct vfio_device_gfx_plane_info plane; + struct vfio_device_gfx_plane_info plane = {}; minsz = offsetofend(struct vfio_device_gfx_plane_info, region_index); From 61050c73441be7933d2170642c3f3e36313e56c8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 18 Sep 2023 16:56:17 -0400 Subject: [PATCH 06/18] vfio: use __aligned_u64 in struct vfio_device_ioeventfd The memory layout of struct vfio_device_ioeventfd is architecture-dependent due to a u64 field and a struct size that is not a multiple of 8 bytes: - On x86_64 the struct size is padded to a multiple of 8 bytes. - On x32 the struct size is only a multiple of 4 bytes, not 8. - Other architectures may vary. Use __aligned_u64 to make memory layout consistent. This reduces the chance of breakage for 32-bit userspace on a 64-bit kernel. 
This patch increases the struct size on x32 but this is safe because of the struct's argsz field. The kernel may grow the struct as long as it still supports smaller argsz values from userspace (e.g. applications compiled against older kernel headers). The code that uses struct vfio_device_ioeventfd already works correctly when the struct size grows, so only the struct definition needs to be changed. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Stefan Hajnoczi Link: https://lore.kernel.org/r/20230918205617.1478722-4-stefanha@redhat.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 8a3f83e0bdc1..7f5fb010226d 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -864,9 +864,10 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) - __u64 offset; /* device fd offset of write */ - __u64 data; /* data to be written */ + __aligned_u64 offset; /* device fd offset of write */ + __aligned_u64 data; /* data to be written */ __s32 fd; /* -1 for de-assignment */ + __u32 reserved; }; #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) From 82470eba9df97a021309b360ec48342deaa3c7af Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:49 +0300 Subject: [PATCH 07/18] vfio/mlx5: Wake up the reader post of disabling the SAVING migration file After disabling the SAVING migration file, which includes setting the file state to MLX5_MIGF_STATE_ERROR, call wake_up_interruptible() on its poll_wait member. This lets any potential reader which is already waiting for data as part of mlx5vf_save_read() wake up, recognize the error state and return with an error. After that, we don't need to rely on any other condition to wake up the reader, such as the completion of a previously executed SAVE command. In addition, this change will simplify error flows (e.g. health recovery) once we move to chunk mode and multiple SAVE commands may run in the STOP_COPY phase, as we will no longer need to rely on a SAVE command to wake up a potentially waiting reader. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 42ec574a8622..2556d5455692 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1019,6 +1019,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev) mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx); cancel_work_sync(&mvdev->saving_migf->async_data.work); mlx5vf_disable_fd(mvdev->saving_migf); + wake_up_interruptible(&mvdev->saving_migf->poll_wait); mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf); fput(mvdev->saving_migf->filp); mvdev->saving_migf = NULL; From 34a64c8eac2380a28eeab191dc5bf4e0dbadf7c6 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:50 +0300 Subject: [PATCH 08/18] vfio/mlx5: Refactor the SAVE callback to activate a work only upon an error Upon a successful SAVE callback there is no need to activate a work; all the required steps can be done directly. 
Accordingly, refactor the above flow to activate a work only upon an error. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 33574b04477d..18d9d1768066 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -475,6 +475,15 @@ found: return buf; } +static void +mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf, + struct mlx5vf_async_data *async_data) +{ + kvfree(async_data->out); + complete(&migf->save_comp); + fput(migf->filp); +} + void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) { struct mlx5vf_async_data *async_data = container_of(_work, @@ -494,9 +503,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) wake_up_interruptible(&migf->poll_wait); } mutex_unlock(&migf->lock); - kvfree(async_data->out); - complete(&migf->save_comp); - fput(migf->filp); + mlx5vf_save_callback_complete(migf, async_data); } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, @@ -560,13 +567,12 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) migf->state = async_data->last_chunk ? MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); + mlx5vf_save_callback_complete(migf, async_data); + return; } err: - /* - * The error and the cleanup flows can't run from an - * interrupt context - */ + /* The error flow can't run from an interrupt context */ if (status == -EREMOTEIO) status = MLX5_GET(save_vhca_state_out, async_data->out, status); async_data->status = status; From 543640af84802e84f1e38ee102882d6ae1354701 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:51 +0300 Subject: [PATCH 09/18] vfio/mlx5: Enable querying state size which is > 4GB Once the device supports 'chunk mode', the driver can support a state size larger than 4GB. In that case the device has the capability to split a single image into multiple chunks, as long as the software provides a buffer of the minimum size reported by the device. The driver should query the minimum required buffer size using the QUERY_VHCA_MIGRATION_STATE command with the 'chunk' bit set in its input; in that case, the output will include both the minimum buffer size (i.e. required_umem_size) and also the remaining total size to be reported/used where applicable. At this point in the series the 'chunk' bit is off; the last patch will activate the feature once all the pieces are ready. Note: Before this change we were limited to a 4GB state size, as the query/save/load commands in the device specification have only a 4-byte size field (i.e. a single image is capped at 2^32 - 1 bytes). 
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 9 ++++++++- drivers/vfio/pci/mlx5/cmd.h | 4 +++- drivers/vfio/pci/mlx5/main.c | 13 +++++++------ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 18d9d1768066..e70d84bf2043 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -86,7 +86,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) } int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size, u8 query_flags) + size_t *state_size, u64 *total_size, + u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -128,6 +129,7 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0); MLX5_SET(query_vhca_migration_state_in, in, incremental, query_flags & MLX5VF_QUERY_INC); + MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode); ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in, out); @@ -139,6 +141,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, *state_size = MLX5_GET(query_vhca_migration_state_out, out, required_umem_size); + if (total_size) + *total_size = mvdev->chunk_mode ? + MLX5_GET64(query_vhca_migration_state_out, out, + remaining_total_size) : *state_size; + return 0; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index aec4c69dd6c1..4fb37598c8e5 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -164,6 +164,7 @@ struct mlx5vf_pci_core_device { u8 deferred_reset:1; u8 mdev_detach:1; u8 log_active:1; + u8 chunk_mode:1; struct completion tracker_comp; /* protect migration state */ struct mutex state_mutex; @@ -186,7 +187,8 @@ enum { int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, - size_t *state_size, u8 query_flags); + size_t *state_size, u64 *total_size, + u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 2556d5455692..90cb36fee6c0 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -428,7 +428,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * As so, the other code below is safe with the proper locks. 
*/ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, - MLX5VF_QUERY_INC); + NULL, MLX5VF_QUERY_INC); if (ret) goto err_state_unlock; } @@ -505,7 +505,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (migf->state == MLX5_MIGF_STATE_ERROR) return -ENODEV; - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); if (ret) goto err; @@ -574,7 +574,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 0); if (ret) goto out_pd; @@ -1195,13 +1195,14 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, struct mlx5vf_pci_core_device *mvdev = container_of( vdev, struct mlx5vf_pci_core_device, core_device.vdev); size_t state_size; + u64 total_size; int ret; mutex_lock(&mvdev->state_mutex); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, - &state_size, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, + &total_size, 0); if (!ret) - *stop_copy_length = state_size; + *stop_copy_length = total_size; mlx5vf_state_mutex_unlock(mvdev); return ret; } From 9114100d10b3e970857afd0bc2f296e147fc9392 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:52 +0300 Subject: [PATCH 10/18] vfio/mlx5: Rename some stuff to match chunk mode In chunk mode there may be multiple images that will be read from the device upon STOP_COPY. This patch prepares for that mode by renaming the relevant fields and states to better matching names. As part of that, be stricter and recognize a PRE_COPY error only when it did not occur on a STOP_COPY chunk. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 15 ++++++++------- drivers/vfio/pci/mlx5/cmd.h | 4 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index e70d84bf2043..7b48a9b80bc6 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -503,7 +503,8 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) mlx5vf_put_data_buffer(async_data->buf); if (async_data->header_buf) mlx5vf_put_data_buffer(async_data->header_buf); - if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) + if (!async_data->stop_copy_chunk && + async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR) migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR; else migf->state = MLX5_MIGF_STATE_ERROR; @@ -553,7 +554,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) size_t image_size; unsigned long flags; bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && - !async_data->last_chunk; + !async_data->stop_copy_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); @@ -571,7 +572,7 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) spin_unlock_irqrestore(&migf->list_lock, flags); if (initial_pre_copy) migf->pre_copy_initial_bytes += image_size; - migf->state = async_data->last_chunk ? + migf->state = async_data->stop_copy_chunk ? 
MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); mlx5vf_save_callback_complete(migf, async_data); @@ -623,7 +624,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, async_data = &migf->async_data; async_data->buf = buf; - async_data->last_chunk = !track; + async_data->stop_copy_chunk = !track; async_data->out = kvzalloc(out_size, GFP_KERNEL); if (!async_data->out) { err = -ENOMEM; @@ -631,7 +632,7 @@ } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - if (async_data->last_chunk && migf->buf_header) { + if (async_data->stop_copy_chunk && migf->buf_header) { header_buf = migf->buf_header; migf->buf_header = NULL; } else { @@ -644,8 +645,8 @@ } } - if (async_data->last_chunk) - migf->state = MLX5_MIGF_STATE_SAVE_LAST; + if (async_data->stop_copy_chunk) + migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK; async_data->header_buf = header_buf; get_file(migf->filp); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 4fb37598c8e5..ac5dca5fe6b1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -20,7 +20,7 @@ enum mlx5_vf_migf_state { MLX5_MIGF_STATE_ERROR = 1, MLX5_MIGF_STATE_PRE_COPY_ERROR, MLX5_MIGF_STATE_PRE_COPY, - MLX5_MIGF_STATE_SAVE_LAST, + MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK, MLX5_MIGF_STATE_COMPLETE, }; @@ -78,7 +78,7 @@ struct mlx5vf_async_data { struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *header_buf; int status; - u8 last_chunk:1; + u8 stop_copy_chunk:1; void *out; }; From 5798e4dd584f23bb5a4db4c524522c4dd8a84a78 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:53 +0300 Subject: [PATCH 11/18] vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase This patch is another preparation step towards working in chunk mode. It pre-allocates chunks for the STOP_COPY phase to let the driver use them immediately and avoid an extra allocation during that phase. Before this patch we had a single large buffer dedicated to the STOP_COPY phase, as there was a single SAVE in the source for the last image. Once we move to chunk mode, the idea is to have several small buffers to be used during the STOP_COPY phase. The driver will read ahead the full state from the firmware in small/optimized chunks while letting QEMU/user space read the available data in parallel. Each buffer holds its chunk number to let it be recognized down the road in the coming patches. The chunk buffer size is chosen based on the minimum size that the firmware requires, the total full size, and a max value in the driver code, which was set to 8MB to achieve optimized downtime in the general case (e.g. with a 1MB firmware minimum and a 20MB total state, the driver allocates 8MB chunk buffers). As chunk mode is applicable even if we move directly to STOP_COPY, the buffer preparation and some other related steps are done unconditionally with regard to STOP/PRE-COPY. Note: At this point in the series chunk mode is still not activated and the first buffer is used in all the places. 
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 23 +++--- drivers/vfio/pci/mlx5/cmd.h | 8 +- drivers/vfio/pci/mlx5/main.c | 152 ++++++++++++++++++++++------------- 3 files changed, 117 insertions(+), 66 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 7b48a9b80bc6..b18735ee5d07 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -632,9 +632,9 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - if (async_data->stop_copy_chunk && migf->buf_header) { - header_buf = migf->buf_header; - migf->buf_header = NULL; + if (async_data->stop_copy_chunk && migf->buf_header[0]) { + header_buf = migf->buf_header[0]; + migf->buf_header[0] = NULL; } else { header_buf = mlx5vf_get_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); @@ -721,18 +721,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf) void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) { struct mlx5_vhca_data_buffer *entry; + int i; lockdep_assert_held(&migf->mvdev->state_mutex); WARN_ON(migf->mvdev->mdev_detach); - if (migf->buf) { - mlx5vf_free_data_buffer(migf->buf); - migf->buf = NULL; - } + for (i = 0; i < MAX_NUM_CHUNKS; i++) { + if (migf->buf[i]) { + mlx5vf_free_data_buffer(migf->buf[i]); + migf->buf[i] = NULL; + } - if (migf->buf_header) { - mlx5vf_free_data_buffer(migf->buf_header); - migf->buf_header = NULL; + if (migf->buf_header[i]) { + mlx5vf_free_data_buffer(migf->buf_header[i]); + migf->buf_header[i] = NULL; + } } list_splice(&migf->avail_list, &migf->buf_list); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index ac5dca5fe6b1..6d8d52804c83 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer { u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; + u8 stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; /* Optimize mlx5vf_get_migration_page() for sequential access */ @@ -82,6 +83,8 @@ struct mlx5vf_async_data { void *out; }; +#define MAX_NUM_CHUNKS 2 + struct mlx5_vf_migration_file { struct file *filp; struct mutex lock; @@ -94,8 +97,9 @@ struct mlx5_vf_migration_file { u32 record_tag; u64 stop_copy_prep_size; u64 pre_copy_initial_bytes; - struct mlx5_vhca_data_buffer *buf; - struct mlx5_vhca_data_buffer *buf_header; + /* Upon chunk mode preserve another set of buffers for stop_copy phase */ + struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS]; + struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS]; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 90cb36fee6c0..351b61303b72 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -24,6 +24,8 @@ /* Device specification max LOAD size */ #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) +#define MAX_CHUNK_SIZE SZ_8M + static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); @@ -304,7 +306,8 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } -static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) +static 
int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, + bool track) { size_t size = sizeof(struct mlx5_vf_migration_header) + sizeof(struct mlx5_vf_migration_tag_stop_copy_data); @@ -331,7 +334,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -340,48 +343,83 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); - migf->pre_copy_initial_bytes = size; + if (track) + migf->pre_copy_initial_bytes = size; return 0; err: mlx5vf_put_data_buffer(header_buf); return ret; } -static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf, - size_t state_size) +static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, + struct mlx5_vf_migration_file *migf, + size_t state_size, u64 full_size, + bool track) { struct mlx5_vhca_data_buffer *buf; size_t inc_state_size; + int num_chunks; int ret; + int i; - /* let's be ready for stop_copy size that might grow by 10 percents */ - if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) - inc_state_size = state_size; + if (mvdev->chunk_mode) { + size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size); - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - migf->buf = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; + /* from firmware perspective at least 'state_size' buffer should be set */ + inc_state_size = max(state_size, chunk_size); + } else { + if (track) { + /* let's be ready for stop_copy size that might grow by 10 percents */ + if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) + inc_state_size = state_size; + } else { + inc_state_size = state_size; + } } - migf->buf_header = buf; - ret = mlx5vf_add_stop_copy_header(migf); + /* let's not overflow the device specification max SAVE size */ + inc_state_size = min_t(size_t, inc_state_size, + (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE)); + + num_chunks = mvdev->chunk_mode ? 
MAX_NUM_CHUNKS : 1; + for (i = 0; i < num_chunks; i++) { + buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + migf->buf[i] = buf; + buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + migf->buf_header[i] = buf; + if (mvdev->chunk_mode) { + migf->buf[i]->stop_copy_chunk_num = i + 1; + migf->buf_header[i]->stop_copy_chunk_num = i + 1; + } + } + + ret = mlx5vf_add_stop_copy_header(migf, track); if (ret) - goto err_header; + goto err; return 0; -err_header: - mlx5vf_put_data_buffer(migf->buf_header); - migf->buf_header = NULL; err: - mlx5vf_put_data_buffer(migf->buf); - migf->buf = NULL; + for (i = 0; i < num_chunks; i++) { + if (migf->buf[i]) { + mlx5vf_put_data_buffer(migf->buf[i]); + migf->buf[i] = NULL; + } + if (migf->buf_header[i]) { + mlx5vf_put_data_buffer(migf->buf_header[i]); + migf->buf_header[i] = NULL; + } + } + return ret; } @@ -511,9 +549,9 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) goto err; /* Checking whether we have a matching pre-allocated buffer that can fit */ - if (migf->buf && migf->buf->allocated_length >= length) { - buf = migf->buf; - migf->buf = NULL; + if (migf->buf[0]->allocated_length >= length) { + buf = migf->buf[0]; + migf->buf[0] = NULL; } else { buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); if (IS_ERR(buf)) { @@ -541,6 +579,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) struct mlx5_vf_migration_file *migf; struct mlx5_vhca_data_buffer *buf; size_t length; + u64 full_size; int ret; migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); @@ -574,20 +613,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); + if (ret) + goto out_pd; + + ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track); if (ret) goto out_pd; if (track) { - ret = mlx5vf_prep_stop_copy(migf, length); - if (ret) + /* leave the allocated buffer ready for the stop-copy phase */ + buf = mlx5vf_alloc_data_buffer(migf, + migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); goto out_pd; - } - - buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto out_pd; + } + } else { + buf = migf->buf[0]; + migf->buf[0] = NULL; } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track); @@ -820,8 +864,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct mlx5_vf_migration_file *migf = filp->private_data; - struct mlx5_vhca_data_buffer *vhca_buf = migf->buf; - struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header; + struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0]; + struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0]; loff_t requested_length; bool has_work = false; ssize_t done = 0; @@ -856,15 +900,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (vhca_buf_header->allocated_length < migf->record_size) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header = mlx5vf_alloc_data_buffer(migf, + migf->buf_header[0] = 
mlx5vf_alloc_data_buffer(migf, migf->record_size, DMA_NONE); - if (IS_ERR(migf->buf_header)) { - ret = PTR_ERR(migf->buf_header); - migf->buf_header = NULL; + if (IS_ERR(migf->buf_header[0])) { + ret = PTR_ERR(migf->buf_header[0]); + migf->buf_header[0] = NULL; goto out_unlock; } - vhca_buf_header = migf->buf_header; + vhca_buf_header = migf->buf_header[0]; } vhca_buf_header->start_pos = migf->max_pos; @@ -884,15 +928,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (vhca_buf->allocated_length < size) { mlx5vf_free_data_buffer(vhca_buf); - migf->buf = mlx5vf_alloc_data_buffer(migf, + migf->buf[0] = mlx5vf_alloc_data_buffer(migf, size, DMA_TO_DEVICE); - if (IS_ERR(migf->buf)) { - ret = PTR_ERR(migf->buf); - migf->buf = NULL; + if (IS_ERR(migf->buf[0])) { + ret = PTR_ERR(migf->buf[0]); + migf->buf[0] = NULL; goto out_unlock; } - vhca_buf = migf->buf; + vhca_buf = migf->buf[0]; } vhca_buf->start_pos = migf->max_pos; @@ -974,7 +1018,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) goto out_pd; } - migf->buf = buf; + migf->buf[0] = buf; if (MLX5VF_PRE_COPY_SUPP(mvdev)) { buf = mlx5vf_alloc_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); @@ -983,7 +1027,7 @@ goto out_buf; } - migf->buf_header = buf; + migf->buf_header[0] = buf; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; } else { /* Initial state will be to read the image */ @@ -997,7 +1041,7 @@ spin_lock_init(&migf->list_lock); return migf; out_buf: - mlx5vf_free_data_buffer(migf->buf); + mlx5vf_free_data_buffer(migf->buf[0]); out_pd: mlx5vf_cmd_dealloc_pd(migf); out_free: @@ -1101,7 +1145,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, if (!MLX5VF_PRE_COPY_SUPP(mvdev)) { ret = mlx5vf_cmd_load_vhca_state(mvdev, mvdev->resuming_migf, - mvdev->resuming_migf->buf); + mvdev->resuming_migf->buf[0]); if (ret) return ERR_PTR(ret); } From 67135f29451ef9dc56ff0415705d667267c4d9d2 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:54 +0300 Subject: [PATCH 12/18] vfio/mlx5: Add support for SAVING in chunk mode Add support for SAVING in chunk mode; this includes running a work that will fill the next chunk from the device. In case the number of available chunks reaches MAX_NUM_CHUNKS, the SAVE of the next chunk will be delayed until the reader consumes one chunk. The next patch in the series will add the reader part of the chunk mode. 
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 43 +++++++++++++++--- drivers/vfio/pci/mlx5/cmd.h | 12 ++++++ drivers/vfio/pci/mlx5/main.c | 84 +++++++++++++++++++++++++++++++----- 3 files changed, 122 insertions(+), 17 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index b18735ee5d07..e68bf9ba5300 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -435,6 +435,7 @@ end: void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) { spin_lock_irq(&buf->migf->list_lock); + buf->stop_copy_chunk_num = 0; list_add_tail(&buf->buf_elm, &buf->migf->avail_list); spin_unlock_irq(&buf->migf->list_lock); } @@ -551,6 +552,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) struct mlx5_vf_migration_file, async_data); if (!status) { + size_t next_required_umem_size = 0; + bool stop_copy_last_chunk; size_t image_size; unsigned long flags; bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && @@ -558,6 +561,11 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); + if (async_data->buf->stop_copy_chunk_num) + next_required_umem_size = MLX5_GET(save_vhca_state_out, + async_data->out, next_required_umem_size); + stop_copy_last_chunk = async_data->stop_copy_chunk && + !next_required_umem_size; if (async_data->header_buf) { status = add_buf_header(async_data->header_buf, image_size, initial_pre_copy); @@ -569,12 +577,28 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) migf->max_pos += async_data->buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); + if (async_data->buf->stop_copy_chunk_num) { + migf->num_ready_chunks++; + if (next_required_umem_size && + migf->num_ready_chunks >= MAX_NUM_CHUNKS) { + /* Delay the next SAVE till one chunk be consumed */ + migf->next_required_umem_size = next_required_umem_size; + next_required_umem_size = 0; + } + } spin_unlock_irqrestore(&migf->list_lock, flags); - if (initial_pre_copy) + if (initial_pre_copy) { migf->pre_copy_initial_bytes += image_size; - migf->state = async_data->stop_copy_chunk ? - MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; + migf->state = MLX5_MIGF_STATE_PRE_COPY; + } + if (stop_copy_last_chunk) + migf->state = MLX5_MIGF_STATE_COMPLETE; wake_up_interruptible(&migf->poll_wait); + if (next_required_umem_size) + mlx5vf_mig_file_set_save_work(migf, + /* Picking up the next chunk num */ + (async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1, + next_required_umem_size); mlx5vf_save_callback_complete(migf, async_data); return; } @@ -632,10 +656,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - if (async_data->stop_copy_chunk && migf->buf_header[0]) { - header_buf = migf->buf_header[0]; - migf->buf_header[0] = NULL; - } else { + if (async_data->stop_copy_chunk) { + u8 header_idx = buf->stop_copy_chunk_num ? 
+ buf->stop_copy_chunk_num - 1 : 0; + + header_buf = migf->buf_header[header_idx]; + migf->buf_header[header_idx] = NULL; + } + + if (!header_buf) { header_buf = mlx5vf_get_data_buffer(migf, sizeof(struct mlx5_vf_migration_header), DMA_NONE); if (IS_ERR(header_buf)) { diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 6d8d52804c83..f2c7227fa683 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -83,6 +83,13 @@ struct mlx5vf_async_data { void *out; }; +struct mlx5vf_save_work_data { + struct mlx5_vf_migration_file *migf; + size_t next_required_umem_size; + struct work_struct work; + u8 chunk_num; +}; + #define MAX_NUM_CHUNKS 2 struct mlx5_vf_migration_file { @@ -97,9 +104,12 @@ struct mlx5_vf_migration_file { u32 record_tag; u64 stop_copy_prep_size; u64 pre_copy_initial_bytes; + size_t next_required_umem_size; + u8 num_ready_chunks; /* Upon chunk mode preserve another set of buffers for stop_copy phase */ struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS]; struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS]; + struct mlx5vf_save_work_data save_data[MAX_NUM_CHUNKS]; spinlock_t list_lock; struct list_head buf_list; struct list_head avail_list; @@ -223,6 +233,8 @@ struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, + u8 chunk_num, size_t next_required_umem_size); int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); int mlx5vf_stop_page_tracker(struct vfio_device *vdev); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 351b61303b72..c80caf55499f 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -306,6 +306,73 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf, + u8 chunk_num, size_t next_required_umem_size) +{ + migf->save_data[chunk_num - 1].next_required_umem_size = + next_required_umem_size; + migf->save_data[chunk_num - 1].migf = migf; + get_file(migf->filp); + queue_work(migf->mvdev->cb_wq, + &migf->save_data[chunk_num - 1].work); +} + +static struct mlx5_vhca_data_buffer * +mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, + u8 index, size_t required_length) +{ + struct mlx5_vhca_data_buffer *buf = migf->buf[index]; + u8 chunk_num; + + WARN_ON(!buf); + chunk_num = buf->stop_copy_chunk_num; + buf->migf->buf[index] = NULL; + /* Checking whether the pre-allocated buffer can fit */ + if (buf->allocated_length >= required_length) + return buf; + + mlx5vf_put_data_buffer(buf); + buf = mlx5vf_get_data_buffer(buf->migf, required_length, + DMA_FROM_DEVICE); + if (IS_ERR(buf)) + return buf; + + buf->stop_copy_chunk_num = chunk_num; + return buf; +} + +static void mlx5vf_mig_file_save_work(struct work_struct *_work) +{ + struct mlx5vf_save_work_data *save_data = container_of(_work, + struct mlx5vf_save_work_data, work); + struct mlx5_vf_migration_file *migf = save_data->migf; + struct mlx5vf_pci_core_device *mvdev = migf->mvdev; + struct mlx5_vhca_data_buffer *buf; + + mutex_lock(&mvdev->state_mutex); + if (migf->state == MLX5_MIGF_STATE_ERROR) + goto end; + + buf = 
mlx5vf_mig_file_get_stop_copy_buf(migf, + save_data->chunk_num - 1, + save_data->next_required_umem_size); + if (IS_ERR(buf)) + goto err; + + if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false)) + goto err_save; + + goto end; + +err_save: + mlx5vf_put_data_buffer(buf); +err: + mlx5vf_mark_err(migf); +end: + mlx5vf_state_mutex_unlock(mvdev); + fput(migf->filp); +} + static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, bool track) { @@ -400,6 +467,9 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, if (mvdev->chunk_mode) { migf->buf[i]->stop_copy_chunk_num = i + 1; migf->buf_header[i]->stop_copy_chunk_num = i + 1; + INIT_WORK(&migf->save_data[i].work, + mlx5vf_mig_file_save_work); + migf->save_data[i].chunk_num = i + 1; } } @@ -548,16 +618,10 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto err; - /* Checking whether we have a matching pre-allocated buffer that can fit */ - if (migf->buf[0]->allocated_length >= length) { - buf = migf->buf[0]; - migf->buf[0] = NULL; - } else { - buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; - } + buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); From a899cacab56b0dc9a94b7c02007d2f0b2076b18e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:55 +0300 Subject: [PATCH 13/18] vfio/mlx5: Add support for READING in chunk mode Add support for READING in chunk mode. In case the last SAVE command recognized that there was still some image left to be read, but no chunk was available to use, this task is delayed for the reader until one chunk is consumed and becomes available. In the above case, a work will be executed to read the next image from the device in the background. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 43 +++++++++++++++++++++++++++++----- 1 file changed, 37 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index c80caf55499f..b6ac66c5008d 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -160,6 +160,41 @@ end: return found ? 
buf : NULL; } +static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf) +{ + struct mlx5_vf_migration_file *migf = vhca_buf->migf; + + if (vhca_buf->stop_copy_chunk_num) { + bool is_header = vhca_buf->dma_dir == DMA_NONE; + u8 chunk_num = vhca_buf->stop_copy_chunk_num; + size_t next_required_umem_size = 0; + + if (is_header) + migf->buf_header[chunk_num - 1] = vhca_buf; + else + migf->buf[chunk_num - 1] = vhca_buf; + + spin_lock_irq(&migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + if (!is_header) { + next_required_umem_size = + migf->next_required_umem_size; + migf->next_required_umem_size = 0; + migf->num_ready_chunks--; + } + spin_unlock_irq(&migf->list_lock); + if (next_required_umem_size) + mlx5vf_mig_file_set_save_work(migf, chunk_num, + next_required_umem_size); + return; + } + + spin_lock_irq(&migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&migf->list_lock); +} + static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, char __user **buf, size_t *len, loff_t *pos) { @@ -195,12 +230,8 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf, copy_len -= page_len; } - if (*pos >= vhca_buf->start_pos + vhca_buf->length) { - spin_lock_irq(&vhca_buf->migf->list_lock); - list_del_init(&vhca_buf->buf_elm); - list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); - spin_unlock_irq(&vhca_buf->migf->list_lock); - } + if (*pos >= vhca_buf->start_pos + vhca_buf->length) + mlx5vf_buf_read_done(vhca_buf); return done; } From fcb2f2ed4a80cfe383d87da75caba958516507e9 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Mon, 11 Sep 2023 12:38:56 +0300 Subject: [PATCH 14/18] vfio/mlx5: Activate the chunk mode functionality Now that all pieces are in place, activate the chunk mode functionality based on device capabilities. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230911093856.81910-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index e68bf9ba5300..efd1d252cdc9 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -261,6 +261,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, mvdev->core_device.vdev.migration_flags |= VFIO_MIGRATION_PRE_COPY; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) + mvdev->chunk_mode = 1; + end: mlx5_vf_put_core_dev(mvdev->mdev); } From f9af5ad0f5b599fadf6fc7c9c2153d7919c7691e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 2 Oct 2023 10:53:13 -0700 Subject: [PATCH 15/18] vfio/cdx: Add parentheses between bitwise AND expression and logical NOT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with clang, there is a warning (or error with CONFIG_WERROR=y) due to a bitwise AND and logical NOT in vfio_cdx_bm_ctrl(): drivers/vfio/cdx/main.c:77:6: error: logical not is only applied to the left hand side of this bitwise operator [-Werror,-Wlogical-not-parentheses] 77 | if (!vdev->flags & BME_SUPPORT) | ^ ~ drivers/vfio/cdx/main.c:77:6: note: add parentheses after the '!' 
to evaluate the bitwise operator first 77 | if (!vdev->flags & BME_SUPPORT) | ^ | ( ) drivers/vfio/cdx/main.c:77:6: note: add parentheses around left hand side expression to silence this warning 77 | if (!vdev->flags & BME_SUPPORT) | ^ | ( ) 1 error generated. Add the parentheses as suggested in the first note, which is clearly what was intended here. Closes: https://github.com/ClangBuiltLinux/linux/issues/1939 Fixes: 8a97ab9b8b31 ("vfio-cdx: add bus mastering device feature support") Signed-off-by: Nathan Chancellor Reviewed-by: Philippe Mathieu-Daudé Acked-by: Nikhil Agarwal Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20231002-vfio-cdx-logical-not-parentheses-v1-1-a8846c7adfb6@kernel.org Signed-off-by: Alex Williamson --- drivers/vfio/cdx/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index a437630be354..a63744302b5e 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -74,7 +74,7 @@ static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags, struct vfio_device_feature_bus_master ops; int ret; - if (!vdev->flags & BME_SUPPORT) + if (!(vdev->flags & BME_SUPPORT)) return -ENOTTY; ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, From bd885fcf28da7da6a2244d30cd45630303c781ed Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 2 Oct 2023 16:43:25 -0600 Subject: [PATCH 16/18] vfio: Fix smatch errors in vfio_combine_iova_ranges() smatch reports: vfio_combine_iova_ranges() error: uninitialized symbol 'last'. vfio_combine_iova_ranges() error: potentially dereferencing uninitialized 'comb_end'. vfio_combine_iova_ranges() error: potentially dereferencing uninitialized 'comb_start'. These errors are only reachable via invalid input, in the case of @last when we receive an empty rb-tree or for @comb_{start,end} if the rb-tree is empty or otherwise fails to produce a second node that reduces the gap. Add tests with warnings for these cases. Reported-by: Cong Liu Link: https://lore.kernel.org/all/20230920095532.88135-1-liucong2@kylinos.cn Reviewed-by: Jason Gunthorpe Reviewed-by: Brett Creeley Link: https://lore.kernel.org/r/20231002224325.3150842-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 40732e8ed4c6..e31e1952d7b8 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -946,6 +946,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, unsigned long last; comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); + + /* Empty list */ + if (WARN_ON_ONCE(!comb_start)) + return; + curr = comb_start; while (curr) { last = curr->last; @@ -975,6 +980,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, prev = curr; curr = interval_tree_iter_next(curr, 0, ULONG_MAX); } + + /* Empty list or no nodes to combine */ + if (WARN_ON_ONCE(min_gap == ULONG_MAX)) + break; + comb_start->last = comb_end->last; interval_tree_remove(comb_end, root); cur_nodes--; From 293fbc28818135743f54d46c418ede3e4a20a742 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 16 Oct 2023 16:47:35 -0600 Subject: [PATCH 17/18] vfio/mtty: Overhaul mtty interrupt handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mtty driver does not currently conform to the vfio SET_IRQS uAPI. 
For example, it claims to support mask and unmask of INTx, but actually does nothing. It claims to support AUTOMASK for INTx, but doesn't. It fails to tear down eventfds under the full semantics specified by the SET_IRQS ioctl. It also fails to tear down eventfds when the device is closed, leading to memory leaks. It claims to support the request IRQ, but doesn't. Fix all these. A side effect of this is that QEMU will now report a warning: vfio : Failed to set up UNMASK eventfd signaling for interrupt \ INTX-0: VFIO_DEVICE_SET_IRQS failure: Inappropriate ioctl for device The fact is that the unmask eventfd was never supported but quietly failed. mtty never honored the AUTOMASK behavior, therefore there was nothing to unmask. QEMU is verbose about the failure, but properly falls back to userspace unmasking. Fixes: 9d1a546c53b4 ("docs: Sample driver to demonstrate how to use Mediated device framework.") Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20231016224736.2575718-2-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 237 +++++++++++++++++++++++++++------------ 1 file changed, 165 insertions(+), 72 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 5af00387c519..245db52bedf2 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -127,7 +127,6 @@ struct serial_port { /* State of each mdev device */ struct mdev_state { struct vfio_device vdev; - int irq_fd; struct eventfd_ctx *intx_evtfd; struct eventfd_ctx *msi_evtfd; int irq_index; @@ -141,6 +140,7 @@ struct mdev_state { struct mutex rxtx_lock; struct vfio_device_info dev_info; int nr_ports; + u8 intx_mask:1; }; static struct mtty_type { @@ -166,10 +166,6 @@ static const struct file_operations vd_fops = { static const struct vfio_device_ops mtty_dev_ops; -/* function prototypes */ - -static int mtty_trigger_interrupt(struct mdev_state *mdev_state); - /* Helper functions */ static void dump_buffer(u8 *buf, uint32_t count) @@ -186,6 +182,36 @@ static void dump_buffer(u8 *buf, uint32_t count) #endif } +static bool is_intx(struct mdev_state *mdev_state) +{ + return mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX; +} + +static bool is_msi(struct mdev_state *mdev_state) +{ + return mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX; +} + +static bool is_noirq(struct mdev_state *mdev_state) +{ + return !is_intx(mdev_state) && !is_msi(mdev_state); +} + +static void mtty_trigger_interrupt(struct mdev_state *mdev_state) +{ + lockdep_assert_held(&mdev_state->ops_lock); + + if (is_msi(mdev_state)) { + if (mdev_state->msi_evtfd) + eventfd_signal(mdev_state->msi_evtfd, 1); + } else if (is_intx(mdev_state)) { + if (mdev_state->intx_evtfd && !mdev_state->intx_mask) { + eventfd_signal(mdev_state->intx_evtfd, 1); + mdev_state->intx_mask = true; + } + } +} + static void mtty_create_config_space(struct mdev_state *mdev_state) { /* PCI dev ID */ @@ -921,6 +947,25 @@ write_err: return -EFAULT; } +static void mtty_disable_intx(struct mdev_state *mdev_state) +{ + if (mdev_state->intx_evtfd) { + eventfd_ctx_put(mdev_state->intx_evtfd); + mdev_state->intx_evtfd = NULL; + mdev_state->intx_mask = false; + mdev_state->irq_index = -1; + } +} + +static void mtty_disable_msi(struct mdev_state *mdev_state) +{ + if (mdev_state->msi_evtfd) { + eventfd_ctx_put(mdev_state->msi_evtfd); + mdev_state->msi_evtfd = NULL; + mdev_state->irq_index = -1; + } +} + static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, unsigned int index, unsigned int start, unsigned
int count, void *data) @@ -932,59 +977,113 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, case VFIO_PCI_INTX_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_MASK: + if (!is_intx(mdev_state) || start != 0 || count != 1) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mdev_state->intx_mask = true; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + mdev_state->intx_mask = true; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + ret = -ENOTTY; /* No support for mask fd */ + } + break; case VFIO_IRQ_SET_ACTION_UNMASK: + if (!is_intx(mdev_state) || start != 0 || count != 1) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mdev_state->intx_mask = false; + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t mask = *(uint8_t *)data; + + if (mask) + mdev_state->intx_mask = false; + } else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { + ret = -ENOTTY; /* No support for unmask fd */ + } break; case VFIO_IRQ_SET_ACTION_TRIGGER: - { - if (flags & VFIO_IRQ_SET_DATA_NONE) { - pr_info("%s: disable INTx\n", __func__); - if (mdev_state->intx_evtfd) - eventfd_ctx_put(mdev_state->intx_evtfd); + if (is_intx(mdev_state) && !count && + (flags & VFIO_IRQ_SET_DATA_NONE)) { + mtty_disable_intx(mdev_state); + break; + } + + if (!(is_intx(mdev_state) || is_noirq(mdev_state)) || + start != 0 || count != 1) { + ret = -EINVAL; break; } if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { int fd = *(int *)data; + struct eventfd_ctx *evt; - if (fd > 0) { - struct eventfd_ctx *evt; + mtty_disable_intx(mdev_state); - evt = eventfd_ctx_fdget(fd); - if (IS_ERR(evt)) { - ret = PTR_ERR(evt); - break; - } - mdev_state->intx_evtfd = evt; - mdev_state->irq_fd = fd; - mdev_state->irq_index = index; + if (fd < 0) + break; + + evt = eventfd_ctx_fdget(fd); + if (IS_ERR(evt)) { + ret = PTR_ERR(evt); break; } + mdev_state->intx_evtfd = evt; + mdev_state->irq_index = index; + break; + } + + if (!is_intx(mdev_state)) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mtty_trigger_interrupt(mdev_state); + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + mtty_trigger_interrupt(mdev_state); } break; } - } break; case VFIO_PCI_MSI_IRQ_INDEX: switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { case VFIO_IRQ_SET_ACTION_MASK: case VFIO_IRQ_SET_ACTION_UNMASK: + ret = -ENOTTY; break; case VFIO_IRQ_SET_ACTION_TRIGGER: - if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (mdev_state->msi_evtfd) - eventfd_ctx_put(mdev_state->msi_evtfd); - pr_info("%s: disable MSI\n", __func__); - mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX; + if (is_msi(mdev_state) && !count && + (flags & VFIO_IRQ_SET_DATA_NONE)) { + mtty_disable_msi(mdev_state); break; } + + if (!(is_msi(mdev_state) || is_noirq(mdev_state)) || + start != 0 || count != 1) { + ret = -EINVAL; + break; + } + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) { int fd = *(int *)data; struct eventfd_ctx *evt; - if (fd <= 0) - break; + mtty_disable_msi(mdev_state); - if (mdev_state->msi_evtfd) + if (fd < 0) break; evt = eventfd_ctx_fdget(fd); @@ -993,20 +1092,37 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, break; } mdev_state->msi_evtfd = evt; - mdev_state->irq_fd = fd; mdev_state->irq_index = index; + break; + } + + if (!is_msi(mdev_state)) { + ret = -EINVAL; + break; + } + + if (flags & VFIO_IRQ_SET_DATA_NONE) { + mtty_trigger_interrupt(mdev_state); + } else if 
(flags & VFIO_IRQ_SET_DATA_BOOL) { + uint8_t trigger = *(uint8_t *)data; + + if (trigger) + mtty_trigger_interrupt(mdev_state); } break; - } - break; + } + break; case VFIO_PCI_MSIX_IRQ_INDEX: - pr_info("%s: MSIX_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: MSIX_IRQ\n", __func__); + ret = -ENOTTY; break; case VFIO_PCI_ERR_IRQ_INDEX: - pr_info("%s: ERR_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: ERR_IRQ\n", __func__); + ret = -ENOTTY; break; case VFIO_PCI_REQ_IRQ_INDEX: - pr_info("%s: REQ_IRQ\n", __func__); + dev_dbg(mdev_state->vdev.dev, "%s: REQ_IRQ\n", __func__); + ret = -ENOTTY; break; } @@ -1014,33 +1130,6 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags, return ret; } -static int mtty_trigger_interrupt(struct mdev_state *mdev_state) -{ - int ret = -1; - - if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) && - (!mdev_state->msi_evtfd)) - return -EINVAL; - else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) && - (!mdev_state->intx_evtfd)) { - pr_info("%s: Intr eventfd not found\n", __func__); - return -EINVAL; - } - - if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) - ret = eventfd_signal(mdev_state->msi_evtfd, 1); - else - ret = eventfd_signal(mdev_state->intx_evtfd, 1); - -#if defined(DEBUG_INTR) - pr_info("Intx triggered\n"); -#endif - if (ret != 1) - pr_err("%s: eventfd signal failed (%d)\n", __func__, ret); - - return ret; -} - static int mtty_get_region_info(struct mdev_state *mdev_state, struct vfio_region_info *region_info, u16 *cap_type_id, void **cap_type) @@ -1084,22 +1173,16 @@ static int mtty_get_region_info(struct mdev_state *mdev_state, static int mtty_get_irq_info(struct vfio_irq_info *irq_info) { - switch (irq_info->index) { - case VFIO_PCI_INTX_IRQ_INDEX: - case VFIO_PCI_MSI_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: - break; - - default: + if (irq_info->index != VFIO_PCI_INTX_IRQ_INDEX && + irq_info->index != VFIO_PCI_MSI_IRQ_INDEX) return -EINVAL; - } irq_info->flags = VFIO_IRQ_INFO_EVENTFD; irq_info->count = 1; if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX) - irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); + irq_info->flags |= VFIO_IRQ_INFO_MASKABLE | + VFIO_IRQ_INFO_AUTOMASKED; else irq_info->flags |= VFIO_IRQ_INFO_NORESIZE; @@ -1262,6 +1345,15 @@ static unsigned int mtty_get_available(struct mdev_type *mtype) return atomic_read(&mdev_avail_ports) / type->nr_ports; } +static void mtty_close(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + mtty_disable_intx(mdev_state); + mtty_disable_msi(mdev_state); +} + static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", .init = mtty_init_dev, @@ -1273,6 +1365,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, .detach_ioas = vfio_iommufd_emulated_detach_ioas, + .close_device = mtty_close, }; static struct mdev_driver mtty_driver = { From 2b88119e35b00d8cb418d86abbace3b90a993bd7 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 16 Oct 2023 16:47:36 -0600 Subject: [PATCH 18/18] vfio/mtty: Enable migration support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mtty driver exposes a PCI serial device to userspace and therefore makes an easy target for a sample device supporting migration. 
The device does not make use of DMA, so we can easily claim support for the migration P2P states, as well as dirty logging. This implementation also makes use of PRE_COPY support to provide migration stream compatibility testing, which should generally be considered good practice. Reviewed-by: Cédric Le Goater Link: https://lore.kernel.org/r/20231016224736.2575718-3-alex.williamson@redhat.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 590 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 590 insertions(+) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 245db52bedf2..69ba0281f9e0 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -29,6 +29,8 @@ #include #include #include +#include + /* * #defines */ @@ -124,6 +126,29 @@ struct serial_port { u8 intr_trigger_level; /* interrupt trigger level */ }; +struct mtty_data { + u64 magic; +#define MTTY_MAGIC 0x7e9d09898c3e2c4e /* Nothing clever, just random */ + u32 major_ver; +#define MTTY_MAJOR_VER 1 + u32 minor_ver; +#define MTTY_MINOR_VER 0 + u32 nr_ports; + u32 flags; + struct serial_port ports[2]; +}; + +struct mdev_state; + +struct mtty_migration_file { + struct file *filp; + struct mutex lock; + struct mdev_state *mdev_state; + struct mtty_data data; + ssize_t filled_size; + u8 disabled:1; +}; + /* State of each mdev device */ struct mdev_state { struct vfio_device vdev; @@ -140,6 +165,12 @@ struct mdev_state { struct mutex rxtx_lock; struct vfio_device_info dev_info; int nr_ports; + enum vfio_device_mig_state state; + struct mutex state_mutex; + struct mutex reset_mutex; + struct mtty_migration_file *saving_migf; + struct mtty_migration_file *resuming_migf; + u8 deferred_reset:1; u8 intx_mask:1; }; @@ -743,6 +774,543 @@ accessfailed: return ret; } +static size_t mtty_data_size(struct mdev_state *mdev_state) +{ + return offsetof(struct mtty_data, ports) + + (mdev_state->nr_ports * sizeof(struct serial_port)); +} + +static void mtty_disable_file(struct mtty_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->disabled = true; + migf->filled_size = 0; + migf->filp->f_pos = 0; + mutex_unlock(&migf->lock); +} + +static void mtty_disable_files(struct mdev_state *mdev_state) +{ + if (mdev_state->saving_migf) { + mtty_disable_file(mdev_state->saving_migf); + fput(mdev_state->saving_migf->filp); + mdev_state->saving_migf = NULL; + } + + if (mdev_state->resuming_migf) { + mtty_disable_file(mdev_state->resuming_migf); + fput(mdev_state->resuming_migf->filp); + mdev_state->resuming_migf = NULL; + } +} + +static void mtty_state_mutex_unlock(struct mdev_state *mdev_state) +{ +again: + mutex_lock(&mdev_state->reset_mutex); + if (mdev_state->deferred_reset) { + mdev_state->deferred_reset = false; + mutex_unlock(&mdev_state->reset_mutex); + mdev_state->state = VFIO_DEVICE_STATE_RUNNING; + mtty_disable_files(mdev_state); + goto again; + } + mutex_unlock(&mdev_state->state_mutex); + mutex_unlock(&mdev_state->reset_mutex); +} + +static int mtty_release_migf(struct inode *inode, struct file *filp) +{ + struct mtty_migration_file *migf = filp->private_data; + + mtty_disable_file(migf); + mutex_destroy(&migf->lock); + kfree(migf); + + return 0; +} + +static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct mtty_migration_file *migf = filp->private_data; + struct mdev_state *mdev_state = migf->mdev_state; + loff_t *pos = &filp->f_pos; + struct vfio_precopy_info info = {}; + unsigned long minsz; + int ret; + + if (cmd !=
VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&mdev_state->state_mutex); + if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY && + mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto unlock; + } + + mutex_lock(&migf->lock); + + if (migf->disabled) { + mutex_unlock(&migf->lock); + ret = -ENODEV; + goto unlock; + } + + if (*pos > migf->filled_size) { + mutex_unlock(&migf->lock); + ret = -EINVAL; + goto unlock; + } + + info.dirty_bytes = 0; + info.initial_bytes = migf->filled_size - *pos; + mutex_unlock(&migf->lock); + + ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +unlock: + mtty_state_mutex_unlock(mdev_state); + return ret; +} + +static ssize_t mtty_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct mtty_migration_file *migf = filp->private_data; + ssize_t ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + mutex_lock(&migf->lock); + + dev_dbg(migf->mdev_state->vdev.dev, "%s ask %zu\n", __func__, len); + + if (migf->disabled) { + ret = -ENODEV; + goto out_unlock; + } + + if (*pos > migf->filled_size) { + ret = -EINVAL; + goto out_unlock; + } + + len = min_t(size_t, migf->filled_size - *pos, len); + if (len) { + if (copy_to_user(buf, (void *)&migf->data + *pos, len)) { + ret = -EFAULT; + goto out_unlock; + } + *pos += len; + ret = len; + } +out_unlock: + dev_dbg(migf->mdev_state->vdev.dev, "%s read %zu\n", __func__, ret); + mutex_unlock(&migf->lock); + return ret; +} + +static const struct file_operations mtty_save_fops = { + .owner = THIS_MODULE, + .read = mtty_save_read, + .unlocked_ioctl = mtty_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = mtty_release_migf, + .llseek = no_llseek, +}; + +static void mtty_save_state(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf = mdev_state->saving_migf; + int i; + + mutex_lock(&migf->lock); + for (i = 0; i < mdev_state->nr_ports; i++) { + memcpy(&migf->data.ports[i], + &mdev_state->s[i], sizeof(struct serial_port)); + migf->filled_size += sizeof(struct serial_port); + } + dev_dbg(mdev_state->vdev.dev, + "%s filled to %zu\n", __func__, migf->filled_size); + mutex_unlock(&migf->lock); +} + +static int mtty_load_state(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf = mdev_state->resuming_migf; + int i; + + mutex_lock(&migf->lock); + /* magic and version already tested by resume write fn */ + if (migf->filled_size < mtty_data_size(mdev_state)) { + dev_dbg(mdev_state->vdev.dev, "%s expected %zu, got %zu\n", + __func__, mtty_data_size(mdev_state), + migf->filled_size); + mutex_unlock(&migf->lock); + return -EINVAL; + } + + for (i = 0; i < mdev_state->nr_ports; i++) + memcpy(&mdev_state->s[i], + &migf->data.ports[i], sizeof(struct serial_port)); + + mutex_unlock(&migf->lock); + return 0; +} + +static struct mtty_migration_file * +mtty_save_device_data(struct mdev_state *mdev_state, + enum vfio_device_mig_state state) +{ + struct mtty_migration_file *migf = mdev_state->saving_migf; + struct mtty_migration_file *ret = NULL; + + if (migf) { + if (state == VFIO_DEVICE_STATE_STOP_COPY) + goto fill_data; + return ret; + } + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("mtty_mig", &mtty_save_fops, + migf, O_RDONLY); + if 
(IS_ERR(migf->filp)) { + int rc = PTR_ERR(migf->filp); + + kfree(migf); + return ERR_PTR(rc); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + migf->mdev_state = mdev_state; + + migf->data.magic = MTTY_MAGIC; + migf->data.major_ver = MTTY_MAJOR_VER; + migf->data.minor_ver = MTTY_MINOR_VER; + migf->data.nr_ports = mdev_state->nr_ports; + + migf->filled_size = offsetof(struct mtty_data, ports); + + dev_dbg(mdev_state->vdev.dev, "%s filled header to %zu\n", + __func__, migf->filled_size); + + ret = mdev_state->saving_migf = migf; + +fill_data: + if (state == VFIO_DEVICE_STATE_STOP_COPY) + mtty_save_state(mdev_state); + + return ret; +} + +static ssize_t mtty_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct mtty_migration_file *migf = filp->private_data; + struct mdev_state *mdev_state = migf->mdev_state; + loff_t requested_length; + ssize_t ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + if (*pos < 0 || + check_add_overflow((loff_t)len, *pos, &requested_length)) + return -EINVAL; + + if (requested_length > mtty_data_size(mdev_state)) + return -ENOMEM; + + mutex_lock(&migf->lock); + + if (migf->disabled) { + ret = -ENODEV; + goto out_unlock; + } + + if (copy_from_user((void *)&migf->data + *pos, buf, len)) { + ret = -EFAULT; + goto out_unlock; + } + + *pos += len; + ret = len; + + dev_dbg(migf->mdev_state->vdev.dev, "%s received %zu, total %zu\n", + __func__, len, migf->filled_size + len); + + if (migf->filled_size < offsetof(struct mtty_data, ports) && + migf->filled_size + len >= offsetof(struct mtty_data, ports)) { + if (migf->data.magic != MTTY_MAGIC || migf->data.flags || + migf->data.major_ver != MTTY_MAJOR_VER || + migf->data.minor_ver != MTTY_MINOR_VER || + migf->data.nr_ports != mdev_state->nr_ports) { + dev_dbg(migf->mdev_state->vdev.dev, + "%s failed validation\n", __func__); + ret = -EFAULT; + } else { + dev_dbg(migf->mdev_state->vdev.dev, + "%s header validated\n", __func__); + } + } + + migf->filled_size += len; + +out_unlock: + mutex_unlock(&migf->lock); + return ret; +} + +static const struct file_operations mtty_resume_fops = { + .owner = THIS_MODULE, + .write = mtty_resume_write, + .release = mtty_release_migf, + .llseek = no_llseek, +}; + +static struct mtty_migration_file * +mtty_resume_device_data(struct mdev_state *mdev_state) +{ + struct mtty_migration_file *migf; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("mtty_mig", &mtty_resume_fops, + migf, O_WRONLY); + if (IS_ERR(migf->filp)) { + ret = PTR_ERR(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + migf->mdev_state = mdev_state; + + mdev_state->resuming_migf = migf; + + return migf; +} + +static struct file *mtty_step_state(struct mdev_state *mdev_state, + enum vfio_device_mig_state new) +{ + enum vfio_device_mig_state cur = mdev_state->state; + + dev_dbg(mdev_state->vdev.dev, "%s: %d -> %d\n", __func__, cur, new); + + /* + * The following state transitions are no-op considering + * mtty does not do DMA nor require any explicit start/stop. 
+ * + * RUNNING -> RUNNING_P2P + * RUNNING_P2P -> RUNNING + * RUNNING_P2P -> STOP + * PRE_COPY -> PRE_COPY_P2P + * PRE_COPY_P2P -> PRE_COPY + * STOP -> RUNNING_P2P + */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && + new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + (new == VFIO_DEVICE_STATE_RUNNING || + new == VFIO_DEVICE_STATE_STOP)) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_RUNNING_P2P)) + return NULL; + + /* + * The following state transitions simply close migration files, + * with the exception of RESUMING -> STOP, which needs to load + * the state first. + * + * RESUMING -> STOP + * PRE_COPY -> RUNNING + * PRE_COPY_P2P -> RUNNING_P2P + * STOP_COPY -> STOP + */ + if (cur == VFIO_DEVICE_STATE_RESUMING && + new == VFIO_DEVICE_STATE_STOP) { + int ret; + + ret = mtty_load_state(mdev_state); + if (ret) + return ERR_PTR(ret); + mtty_disable_files(mdev_state); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_PRE_COPY && + new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_STOP_COPY && + new == VFIO_DEVICE_STATE_STOP)) { + mtty_disable_files(mdev_state); + return NULL; + } + + /* + * The following state transitions return migration files. + * + * RUNNING -> PRE_COPY + * RUNNING_P2P -> PRE_COPY_P2P + * STOP -> STOP_COPY + * STOP -> RESUMING + * PRE_COPY_P2P -> STOP_COPY + */ + if ((cur == VFIO_DEVICE_STATE_RUNNING && + new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P) || + (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_STOP_COPY) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && + new == VFIO_DEVICE_STATE_STOP_COPY)) { + struct mtty_migration_file *migf; + + migf = mtty_save_device_data(mdev_state, new); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + if (migf) { + get_file(migf->filp); + + return migf->filp; + } + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && + new == VFIO_DEVICE_STATE_RESUMING) { + struct mtty_migration_file *migf; + + migf = mtty_resume_device_data(mdev_state); + if (IS_ERR(migf)) + return ERR_CAST(migf); + + get_file(migf->filp); + + return migf->filp; + } + + /* vfio_mig_get_next_state() does not use arcs other than the above */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static struct file *mtty_set_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct file *ret = NULL; + + dev_dbg(vdev->dev, "%s -> %d\n", __func__, new_state); + + mutex_lock(&mdev_state->state_mutex); + while (mdev_state->state != new_state) { + enum vfio_device_mig_state next_state; + int rc = vfio_mig_get_next_state(vdev, mdev_state->state, + new_state, &next_state); + if (rc) { + ret = ERR_PTR(rc); + break; + } + + ret = mtty_step_state(mdev_state, next_state); + if (IS_ERR(ret)) + break; + + mdev_state->state = next_state; + + if (WARN_ON(ret && new_state != next_state)) { + fput(ret); + ret = ERR_PTR(-EINVAL); + break; + } + } + mtty_state_mutex_unlock(mdev_state); + return ret; +} + +static int mtty_get_state(struct vfio_device *vdev, + enum vfio_device_mig_state *current_state) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + 
mutex_lock(&mdev_state->state_mutex); + *current_state = mdev_state->state; + mtty_state_mutex_unlock(mdev_state); + return 0; +} + +static int mtty_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + *stop_copy_length = mtty_data_size(mdev_state); + return 0; +} + +static const struct vfio_migration_ops mtty_migration_ops = { + .migration_set_state = mtty_set_state, + .migration_get_state = mtty_get_state, + .migration_get_data_size = mtty_get_data_size, +}; + +static int mtty_log_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, + u32 nnodes, u64 *page_size) +{ + return 0; +} + +static int mtty_log_stop(struct vfio_device *vdev) +{ + return 0; +} + +static int mtty_log_read_and_clear(struct vfio_device *vdev, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty) +{ + return 0; +} + +static const struct vfio_log_ops mtty_log_ops = { + .log_start = mtty_log_start, + .log_stop = mtty_log_stop, + .log_read_and_clear = mtty_log_read_and_clear, +}; + static int mtty_init_dev(struct vfio_device *vdev) { struct mdev_state *mdev_state = @@ -775,6 +1343,16 @@ static int mtty_init_dev(struct vfio_device *vdev) mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; mtty_create_config_space(mdev_state); + + mutex_init(&mdev_state->state_mutex); + mutex_init(&mdev_state->reset_mutex); + vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + vdev->mig_ops = &mtty_migration_ops; + vdev->log_ops = &mtty_log_ops; + mdev_state->state = VFIO_DEVICE_STATE_RUNNING; + return 0; err_nr_ports: @@ -808,6 +1386,8 @@ static void mtty_release_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); + mutex_destroy(&mdev_state->reset_mutex); + mutex_destroy(&mdev_state->state_mutex); atomic_add(mdev_state->nr_ports, &mdev_avail_ports); kfree(mdev_state->vconfig); } @@ -824,6 +1404,15 @@ static int mtty_reset(struct mdev_state *mdev_state) { pr_info("%s: called\n", __func__); + mutex_lock(&mdev_state->reset_mutex); + mdev_state->deferred_reset = true; + if (!mutex_trylock(&mdev_state->state_mutex)) { + mutex_unlock(&mdev_state->reset_mutex); + return 0; + } + mutex_unlock(&mdev_state->reset_mutex); + mtty_state_mutex_unlock(mdev_state); + return 0; } @@ -1350,6 +1939,7 @@ static void mtty_close(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); + mtty_disable_files(mdev_state); mtty_disable_intx(mdev_state); mtty_disable_msi(mdev_state); }
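
As a usage illustration for the migration interface added above, the following is a minimal userspace sketch (not part of the series) that steps an mtty device into STOP_COPY and drains the resulting migration stream via the VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE feature. The vfio uAPI structures and flags are taken from <linux/vfio.h>; "device_fd" and the helper names are illustrative assumptions, and error handling is trimmed.

#include <linux/vfio.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

/*
 * Request @new_state through VFIO_DEVICE_FEATURE. On success, return the
 * data_fd reported by the kernel; it is only meaningful for the
 * data-transfer states (STOP_COPY, PRE_COPY, RESUMING).
 */
static int mig_set_state(int device_fd, unsigned int new_state)
{
	char buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_mig_state)] = {};
	struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
	struct vfio_device_feature_mig_state *mig =
		(struct vfio_device_feature_mig_state *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
	mig->device_state = new_state;

	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
		return -1;

	return mig->data_fd;
}

/*
 * Save the device: stop it, enter STOP_COPY, then read the stream until
 * EOF. For mtty this yields the mtty_data header plus per-port state.
 */
static int save_device(int device_fd, FILE *out)
{
	char data[4096];
	ssize_t n;
	int data_fd;

	if (mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP) < 0)
		return -1;

	data_fd = mig_set_state(device_fd, VFIO_DEVICE_STATE_STOP_COPY);
	if (data_fd < 0)
		return -1;

	while ((n = read(data_fd, data, sizeof(data))) > 0)
		fwrite(data, 1, n, out);

	close(data_fd);
	return n < 0 ? -1 : 0;
}

Because mtty advertises VFIO_MIGRATION_P2P, a request for STOP is walked through the intermediate RUNNING -> RUNNING_P2P -> STOP arcs by vfio_mig_get_next_state(), matching the no-op transitions listed in mtty_step_state().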