linux-stable/include/linux/vfio.h

248 lines
8.1 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-only */
/*
* VFIO API definition
*
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
* Author: Alex Williamson <alex.williamson@redhat.com>
*/
#ifndef VFIO_H
#define VFIO_H
#include <linux/iommu.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/poll.h>
#include <uapi/linux/vfio.h>
struct kvm;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 01:19:00 +00:00
/*
* VFIO devices can be placed in a set, this allows all devices to share this
* structure and the VFIO core will provide a lock that is held around
* open_device()/close_device() for all devices in the set.
*/
struct vfio_device_set {
void *set_id;
struct mutex lock;
struct list_head device_list;
unsigned int device_count;
};
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
struct vfio_device {
struct device *dev;
const struct vfio_device_ops *ops;
/*
* mig_ops is a static property of the vfio_device which must be set
* prior to registering the vfio_device.
*/
const struct vfio_migration_ops *mig_ops;
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
struct vfio_group *group;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 01:19:00 +00:00
struct vfio_device_set *dev_set;
struct list_head dev_set_list;
unsigned int migration_flags;
/* Driver must reference the kvm during open_device or never touch it */
struct kvm *kvm;
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
/* Members below here are private, not for driver use */
refcount_t refcount;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 01:19:00 +00:00
unsigned int open_count;
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
struct completion comp;
struct list_head group_next;
};
/**
* struct vfio_device_ops - VFIO bus driver device callbacks
*
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 01:19:00 +00:00
* @open_device: Called when the first file descriptor is opened for this device
* @close_device: Opposite of open_device
* @read: Perform read(2) on device file descriptor
* @write: Perform write(2) on device file descriptor
* @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_*
* operations documented below
* @mmap: Perform mmap(2) on a region of the device file descriptor
* @request: Request for the bus driver to release the device
* @match: Optional device name match callback (return: 0 for no-match, >0 for
* match, -errno for abort (ex. match with insufficient or incorrect
* additional args)
* @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
*/
struct vfio_device_ops {
char *name;
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 01:19:00 +00:00
int (*open_device)(struct vfio_device *vdev);
void (*close_device)(struct vfio_device *vdev);
ssize_t (*read)(struct vfio_device *vdev, char __user *buf,
size_t count, loff_t *ppos);
ssize_t (*write)(struct vfio_device *vdev, const char __user *buf,
size_t count, loff_t *size);
long (*ioctl)(struct vfio_device *vdev, unsigned int cmd,
unsigned long arg);
int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
void (*request)(struct vfio_device *vdev, unsigned int count);
int (*match)(struct vfio_device *vdev, char *buf);
int (*device_feature)(struct vfio_device *device, u32 flags,
void __user *arg, size_t argsz);
};
/**
* @migration_set_state: Optional callback to change the migration state for
* devices that support migration. It's mandatory for
* VFIO_DEVICE_FEATURE_MIGRATION migration support.
* The returned FD is used for data transfer according to the FSM
* definition. The driver is responsible to ensure that FD reaches end
* of stream or error whenever the migration FSM leaves a data transfer
* state or before close_device() returns.
* @migration_get_state: Optional callback to get the migration state for
* devices that support migration. It's mandatory for
* VFIO_DEVICE_FEATURE_MIGRATION migration support.
*/
struct vfio_migration_ops {
vfio: Define device migration protocol v2 Replace the existing region based migration protocol with an ioctl based protocol. The two protocols have the same general semantic behaviors, but the way the data is transported is changed. This is the STOP_COPY portion of the new protocol, it defines the 5 states for basic stop and copy migration and the protocol to move the migration data in/out of the kernel. Compared to the clarification of the v1 protocol Alex proposed: https://lore.kernel.org/r/163909282574.728533.7460416142511440919.stgit@omen This has a few deliberate functional differences: - ERROR arcs allow the device function to remain unchanged. - The protocol is not required to return to the original state on transition failure. Instead userspace can execute an unwind back to the original state, reset, or do something else without needing kernel support. This simplifies the kernel design and should userspace choose a policy like always reset, avoids doing useless work in the kernel on error handling paths. - PRE_COPY is made optional, userspace must discover it before using it. This reflects the fact that the majority of drivers we are aware of right now will not implement PRE_COPY. - segmentation is not part of the data stream protocol, the receiver does not have to reproduce the framing boundaries. The hybrid FSM for the device_state is described as a Mealy machine by documenting each of the arcs the driver is required to implement. Defining the remaining set of old/new device_state transitions as 'combination transitions' which are naturally defined as taking multiple FSM arcs along the shortest path within the FSM's digraph allows a complete matrix of transitions. A new VFIO_DEVICE_FEATURE of VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE is defined to replace writing to the device_state field in the region. This allows returning a brand new FD whenever the requested transition opens a data transfer session. The VFIO core code implements the new feature and provides a helper function to the driver. Using the helper the driver only has to implement 6 of the FSM arcs and the other combination transitions are elaborated consistently from those arcs. A new VFIO_DEVICE_FEATURE of VFIO_DEVICE_FEATURE_MIGRATION is defined to report the capability for migration and indicate which set of states and arcs are supported by the device. The FSM provides a lot of flexibility to make backwards compatible extensions but the VFIO_DEVICE_FEATURE also allows for future breaking extensions for scenarios that cannot support even the basic STOP_COPY requirements. The VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE with the GET option (i.e. VFIO_DEVICE_FEATURE_GET) can be used to read the current migration state of the VFIO device. Data transfer sessions are now carried over a file descriptor, instead of the region. The FD functions for the lifetime of the data transfer session. read() and write() transfer the data with normal Linux stream FD semantics. This design allows future expansion to support poll(), io_uring, and other performance optimizations. The complicated mmap mode for data transfer is discarded as current qemu doesn't take meaningful advantage of it, and the new qemu implementation avoids substantially all the performance penalty of using a read() on the region. Link: https://lore.kernel.org/all/20220224142024.147653-10-yishaih@nvidia.com Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Alex Williamson <alex.williamson@redhat.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2022-02-24 14:20:18 +00:00
struct file *(*migration_set_state)(
struct vfio_device *device,
enum vfio_device_mig_state new_state);
int (*migration_get_state)(struct vfio_device *device,
enum vfio_device_mig_state *curr_state);
};
/**
* vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
* @flags: Arg from the device_feature op
* @argsz: Arg from the device_feature op
* @supported_ops: Combination of VFIO_DEVICE_FEATURE_GET and SET the driver
* supports
* @minsz: Minimum data size the driver accepts
*
* For use in a driver's device_feature op. Checks that the inputs to the
* VFIO_DEVICE_FEATURE ioctl are correct for the driver's feature. Returns 1 if
* the driver should execute the get or set, otherwise the relevant
* value should be returned.
*/
static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops,
size_t minsz)
{
if ((flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)) &
~supported_ops)
return -EINVAL;
if (flags & VFIO_DEVICE_FEATURE_PROBE)
return 0;
/* Without PROBE one of GET or SET must be requested */
if (!(flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)))
return -EINVAL;
if (argsz < minsz)
return -EINVAL;
return 1;
}
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
const struct vfio_device_ops *ops);
void vfio_uninit_group_dev(struct vfio_device *device);
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
int vfio_register_group_dev(struct vfio_device *device);
int vfio_register_emulated_iommu_dev(struct vfio_device *device);
vfio: Split creation of a vfio_device into init and register ops This makes the struct vfio_device part of the public interface so it can be used with container_of and so forth, as is typical for a Linux subystem. This is the first step to bring some type-safety to the vfio interface by allowing the replacement of 'void *' and 'struct device *' inputs with a simple and clear 'struct vfio_device *' For now the self-allocating vfio_add_group_dev() interface is kept so each user can be updated as a separate patch. The expected usage pattern is driver core probe() function: my_device = kzalloc(sizeof(*mydevice)); vfio_init_group_dev(&my_device->vdev, dev, ops, mydevice); /* other driver specific prep */ vfio_register_group_dev(&my_device->vdev); dev_set_drvdata(dev, my_device); driver core remove() function: my_device = dev_get_drvdata(dev); vfio_unregister_group_dev(&my_device->vdev); /* other driver specific tear down */ kfree(my_device); Allowing the driver to be able to use the drvdata and vfio_device to go to/from its own data. The pattern also makes it clear that vfio_register_group_dev() must be last in the sequence, as once it is called the core code can immediately start calling ops. The init/register gap is provided to allow for the driver to do setup before ops can be called and thus avoid races. Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Liu Yi L <yi.l.liu@intel.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Eric Auger <eric.auger@redhat.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Message-Id: <3-v3-225de1400dfc+4e074-vfio1_jgg@nvidia.com> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-03-30 15:53:05 +00:00
void vfio_unregister_group_dev(struct vfio_device *device);
vfio: Provide better generic support for open/release vfio_device_ops Currently the driver ops have an open/release pair that is called once each time a device FD is opened or closed. Add an additional set of open/close_device() ops which are called when the device FD is opened for the first time and closed for the last time. An analysis shows that all of the drivers require this semantic. Some are open coding it as part of their reflck implementation, and some are just buggy and miss it completely. To retain the current semantics PCI and FSL depend on, introduce the idea of a "device set" which is a grouping of vfio_device's that share the same lock around opening. The device set is established by providing a 'set_id' pointer. All vfio_device's that provide the same pointer will be joined to the same singleton memory and lock across the whole set. This effectively replaces the oddly named reflck. After conversion the set_id will be sourced from: - A struct device from a fsl_mc_device (fsl) - A struct pci_slot (pci) - A struct pci_bus (pci) - The struct vfio_device (everything) The design ensures that the above pointers are live as long as the vfio_device is registered, so they form reliable unique keys to group vfio_devices into sets. This implementation uses xarray instead of searching through the driver core structures, which simplifies the somewhat tricky locking in this area. Following patches convert all the drivers. Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Link: https://lore.kernel.org/r/4-v4-9ea22c5e6afb+1adf-vfio_reflck_jgg@nvidia.com Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2021-08-06 01:19:00 +00:00
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
vfio: Define device migration protocol v2 Replace the existing region based migration protocol with an ioctl based protocol. The two protocols have the same general semantic behaviors, but the way the data is transported is changed. This is the STOP_COPY portion of the new protocol, it defines the 5 states for basic stop and copy migration and the protocol to move the migration data in/out of the kernel. Compared to the clarification of the v1 protocol Alex proposed: https://lore.kernel.org/r/163909282574.728533.7460416142511440919.stgit@omen This has a few deliberate functional differences: - ERROR arcs allow the device function to remain unchanged. - The protocol is not required to return to the original state on transition failure. Instead userspace can execute an unwind back to the original state, reset, or do something else without needing kernel support. This simplifies the kernel design and should userspace choose a policy like always reset, avoids doing useless work in the kernel on error handling paths. - PRE_COPY is made optional, userspace must discover it before using it. This reflects the fact that the majority of drivers we are aware of right now will not implement PRE_COPY. - segmentation is not part of the data stream protocol, the receiver does not have to reproduce the framing boundaries. The hybrid FSM for the device_state is described as a Mealy machine by documenting each of the arcs the driver is required to implement. Defining the remaining set of old/new device_state transitions as 'combination transitions' which are naturally defined as taking multiple FSM arcs along the shortest path within the FSM's digraph allows a complete matrix of transitions. A new VFIO_DEVICE_FEATURE of VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE is defined to replace writing to the device_state field in the region. This allows returning a brand new FD whenever the requested transition opens a data transfer session. The VFIO core code implements the new feature and provides a helper function to the driver. Using the helper the driver only has to implement 6 of the FSM arcs and the other combination transitions are elaborated consistently from those arcs. A new VFIO_DEVICE_FEATURE of VFIO_DEVICE_FEATURE_MIGRATION is defined to report the capability for migration and indicate which set of states and arcs are supported by the device. The FSM provides a lot of flexibility to make backwards compatible extensions but the VFIO_DEVICE_FEATURE also allows for future breaking extensions for scenarios that cannot support even the basic STOP_COPY requirements. The VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE with the GET option (i.e. VFIO_DEVICE_FEATURE_GET) can be used to read the current migration state of the VFIO device. Data transfer sessions are now carried over a file descriptor, instead of the region. The FD functions for the lifetime of the data transfer session. read() and write() transfer the data with normal Linux stream FD semantics. This design allows future expansion to support poll(), io_uring, and other performance optimizations. The complicated mmap mode for data transfer is discarded as current qemu doesn't take meaningful advantage of it, and the new qemu implementation avoids substantially all the performance penalty of using a read() on the region. Link: https://lore.kernel.org/all/20220224142024.147653-10-yishaih@nvidia.com Signed-off-by: Jason Gunthorpe <jgg@nvidia.com> Tested-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com> Reviewed-by: Kevin Tian <kevin.tian@intel.com> Reviewed-by: Alex Williamson <alex.williamson@redhat.com> Reviewed-by: Cornelia Huck <cohuck@redhat.com> Signed-off-by: Yishai Hadas <yishaih@nvidia.com> Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
2022-02-24 14:20:18 +00:00
int vfio_mig_get_next_state(struct vfio_device *device,
enum vfio_device_mig_state cur_fsm,
enum vfio_device_mig_state new_fsm,
enum vfio_device_mig_state *next_fsm);
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-05 16:52:36 +00:00
/*
* External user API
*/
struct iommu_group *vfio_file_iommu_group(struct file *file);
bool vfio_file_enforced_coherent(struct file *file);
void vfio_file_set_kvm(struct file *file, struct kvm *kvm);
bool vfio_file_has_dev(struct file *file, struct vfio_device *device);
vfio: add external user support VFIO is designed to be used via ioctls on file descriptors returned by VFIO. However in some situations support for an external user is required. The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to use the existing VFIO groups for exclusive access in real/virtual mode on a host to avoid passing map/unmap requests to the user space which would made things pretty slow. The protocol includes: 1. do normal VFIO init operation: - opening a new container; - attaching group(s) to it; - setting an IOMMU driver for a container. When IOMMU is set for a container, all groups in it are considered ready to use by an external user. 2. User space passes a group fd to an external user. The external user calls vfio_group_get_external_user() to verify that: - the group is initialized; - IOMMU is set for it. If both checks passed, vfio_group_get_external_user() increments the container user counter to prevent the VFIO group from disposal before KVM exits. 3. The external user calls vfio_external_user_iommu_id() to know an IOMMU ID. PPC64 KVM uses it to link logical bus number (LIOBN) with IOMMU ID. 4. When the external KVM finishes, it calls vfio_group_put_external_user() to release the VFIO group. This call decrements the container user counter. Everything gets released. The "vfio: Limit group opens" patch is also required for the consistency. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2013-08-05 16:52:36 +00:00
#define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long))
int vfio_pin_pages(struct vfio_device *device, unsigned long *user_pfn,
int npage, int prot, unsigned long *phys_pfn);
int vfio_unpin_pages(struct vfio_device *device, unsigned long *user_pfn,
int npage);
int vfio_dma_rw(struct vfio_device *device, dma_addr_t user_iova,
void *data, size_t len, bool write);
/* each type has independent events */
enum vfio_notify_type {
VFIO_IOMMU_NOTIFY = 0,
};
/* events for VFIO_IOMMU_NOTIFY */
#define VFIO_IOMMU_NOTIFY_DMA_UNMAP BIT(0)
int vfio_register_notifier(struct vfio_device *device,
enum vfio_notify_type type,
unsigned long *required_events,
struct notifier_block *nb);
int vfio_unregister_notifier(struct vfio_device *device,
enum vfio_notify_type type,
struct notifier_block *nb);
/*
* Sub-module helpers
*/
struct vfio_info_cap {
struct vfio_info_cap_header *buf;
size_t size;
};
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
size_t size, u16 id,
u16 version);
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset);
int vfio_info_add_capability(struct vfio_info_cap *caps,
struct vfio_info_cap_header *cap, size_t size);
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr,
int num_irqs, int max_irq_type,
size_t *data_size);
struct pci_dev;
#if IS_ENABLED(CONFIG_VFIO_SPAPR_EEH)
void vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group, unsigned int cmd,
unsigned long arg);
#else
static inline void vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
{
}
static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
{
}
static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
unsigned int cmd,
unsigned long arg)
{
return -ENOTTY;
}
#endif /* CONFIG_VFIO_SPAPR_EEH */
/*
* IRQfd - generic
*/
struct virqfd {
void *opaque;
struct eventfd_ctx *eventfd;
int (*handler)(void *, void *);
void (*thread)(void *, void *);
void *data;
struct work_struct inject;
wait_queue_entry_t wait;
poll_table pt;
struct work_struct shutdown;
struct virqfd **pvirqfd;
};
int vfio_virqfd_enable(void *opaque, int (*handler)(void *, void *),
void (*thread)(void *, void *), void *data,
struct virqfd **pvirqfd, int fd);
void vfio_virqfd_disable(struct virqfd **pvirqfd);
#endif /* VFIO_H */