IOMMU Fixes for Linux v5.7-rc4

Including:
 
 	- The race condition fixes for the AMD IOMMU driver. These are 5
 	  patches fixing two race conditions around
 	  increase_address_space(). The first race condition was around
 	  the non-atomic update of the domain page-table root pointer
 	  and the variable containing the page-table depth (called
 	  mode). This is now fixed by merging page-table root and mode
 	  into one 64-bit field which is read/written atomically.
 
 	  The second race condition was around updating the page-table
 	  root pointer and making it public before the hardware caches
 	  were flushed. This could cause addresses to be mapped and
 	  returned to drivers which are not reachable by IOMMU hardware
 	  yet, causing IO page-faults. This is fixed too by adding the
 	  necessary flushes before a new page-table root is published.
 
 	  Related to the race condition fixes, these patches also add a
 	  missing domain_flush_complete() barrier to update_domain() and
 	  a fix to bail out of the loop which tries to increase the
 	  address space when the call to increase_address_space() fails.
 
 	  Qian was able to trigger the race conditions under high load
 	  and memory pressure within a few days of testing. He confirmed
 	  that he has not seen the issues again with the fixes included
 	  here.
 
 	- Fix for a list-handling bug in the VirtIO IOMMU driver.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEr9jSbILcajRFYWYyK/BELZcBGuMFAl638MYACgkQK/BELZcB
 GuOQxQ/5AYorgKuGkqVbob69YWZuSAEG08dlzDw4C8CDnPKXEPd0L4gJGLP7BpEh
 bPJo9QJtXW7zG6Hhk8sWk9/iONsThngoudaQrodJwaQRdCDGaDZlvBaezG2Vx4xb
 A2OrcM9lvQSODdgyf3x0O1cX7vkQ4J6nJR1Z8Fw4EufjH6TS9DR0tf8ZWHtIpHa6
 Josu3M+qhUXPsn7KK5o7GtNib7sI4whLldYaASGsuaFGzod3CgA0cgmL2HfD+DWP
 k1EIEZTCaOq0BamtpyXbSA6o0AxwKERr/KONi1pL0xN4r0yCjsxEQ6+Rw4caqvgA
 zrfv3kk4a+wFAxOe0hUEtKk8Oy587LPJvIX4FnjG8hRnBrEaQC9vy4eMj05utPid
 PpsNQ35P+SyrxTlIp7ybIVhUvKbxih8SSpRsjx16vX+r/h4SRvWHzjpHVq/4+gIT
 TeZGw1g7xCIyjzn5HqLs/nMG/Ly9QHQaWia8slJJgbzI/deUXAVTy6PmMrqHB+zv
 e0PelKsq5lEQBrFX+r/Sg5hBViKaMykXKbXXg3KIolzlutJc2Rrzh4EEKpP/ug2/
 upTXf+NvMobNxb3QLqn3IJApIirEGYQqI7lwjiUwTC5xb3EfYLUuRa5i4fbOAZIv
 krsVM4sNX1S32TblTMzDDOEEggPG1wPhVF5B+1emOolYHek3ShI=
 =gqwr
 -----END PGP SIGNATURE-----

Merge tag 'iommu-fixes-v5.7-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu

Pull iommu fixes from Joerg Roedel:

 - Race condition fixes for the AMD IOMMU driver.

   These are five patches fixing two race conditions around
   increase_address_space(). The first race condition was around the
   non-atomic update of the domain page-table root pointer and the
   variable containing the page-table depth (called mode). This is now
   fixed by merging page-table root and mode into one 64-bit field which
   is read/written atomically (a sketch of this encoding is included
   after the list below).

   The second race condition was around updating the page-table root
   pointer and making it public before the hardware caches were flushed.
   This could cause addresses to be mapped and returned to drivers which
   are not reachable by IOMMU hardware yet, causing IO page-faults. This
   is fixed too by adding the necessary flushes before a new page-table
   root is published.

   Related to the race condition fixes, these patches also add a missing
   domain_flush_complete() barrier to update_domain() and a fix to bail
   out of the loop which tries to increase the address space when the
   call to increase_address_space() fails.

   Qian was able to trigger the race conditions under high load and
   memory pressure within a few days of testing. He confirmed that he
   has not seen the issues again with the fixes included here.

 - Fix for a list-handling bug in the VirtIO IOMMU driver: the arguments
   to list_add() were reversed (see the note after the shortlog below).

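As background for the first fix above, here is a minimal, stand-alone sketch of the encoding idea: the page-table mode lives in the lowest three bits of a single 64-bit value, the (aligned) root pointer in the remaining bits, and that value is only ever read and written atomically, so a reader always observes a matching root/mode pair. The helper names loosely mirror amd_iommu_domain_encode_pgtable()/amd_iommu_domain_get_pgtable() from the diff further down, but PT_MODE_MASK, encode_pgtable(), get_pgtable(), publish_pgtable() and the use of C11 stdatomic are illustrative choices for this sketch, not the kernel code itself.

/*
 * Simplified sketch of the pt_root encoding (not the in-kernel code).
 * Assumes the root pointer is at least 8-byte aligned, so the low
 * three bits are free to carry the page-table mode.
 */
#include <stdatomic.h>
#include <stdint.h>

#define PT_MODE_MASK 0x7ULL	/* lowest 3 bits encode pgtable mode */

struct domain_pgtable {
	int mode;
	uint64_t *root;
};

static uint64_t encode_pgtable(uint64_t *root, int mode)
{
	return ((uint64_t)(uintptr_t)root & ~PT_MODE_MASK) | (mode & PT_MODE_MASK);
}

/* One atomic load yields a consistent root/mode pair. */
static void get_pgtable(_Atomic uint64_t *pt_root, struct domain_pgtable *out)
{
	uint64_t v = atomic_load(pt_root);

	out->root = (uint64_t *)(uintptr_t)(v & ~PT_MODE_MASK);
	out->mode = (int)(v & PT_MODE_MASK);
}

/*
 * For the second fix: publish a new root with a single atomic store, and
 * only after the device table and IOMMU caches have been updated and
 * flushed (update_and_flush_device_table()/domain_flush_complete() in the
 * real driver).
 */
static void publish_pgtable(_Atomic uint64_t *pt_root, uint64_t *root, int mode)
{
	/* ...update and flush hardware-visible state here first... */
	atomic_store(pt_root, encode_pgtable(root, mode));
}

With this layout a lock-free reader such as fetch_pte() can no longer see a new root paired with an old mode, or a root that the IOMMU hardware cannot reach yet, which is exactly what the two races above allowed.
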
* tag 'iommu-fixes-v5.7-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
  iommu/virtio: Reverse arguments to list_add
  iommu/amd: Do not flush Device Table in iommu_map_page()
  iommu/amd: Update Device Table in increase_address_space()
  iommu/amd: Call domain_flush_complete() in update_domain()
  iommu/amd: Do not loop forever when trying to increase address space
  iommu/amd: Fix race in increase_address_space()/fetch_pte()
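
A short note on the VirtIO IOMMU patch in the list above: the kernel's list_add() takes the new entry as its first argument and the list head as its second, and the driver had them the wrong way round. The sketch below is a stand-alone illustration of the corrected call; the struct and function names are made up for the example, and only the argument order matches the one-line fix in the diff at the end of this page.

#include <linux/list.h>

struct resv_region_entry {
	struct list_head list;
	/* ...per-region data... */
};

static void add_resv_region(struct list_head *resv_regions,
			    struct resv_region_entry *region)
{
	/* Correct: new entry first, list head second. */
	list_add(&region->list, resv_regions);

	/*
	 * The bug had the arguments reversed:
	 *	list_add(resv_regions, &region->list);
	 * which splices the shared list head into the new entry's list
	 * instead of adding the entry to the shared list.
	 */
}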
commit 27d2dcb1b9
Author: Linus Torvalds
Date:   2020-05-10 11:26:23 -07:00
3 changed files with 162 additions and 47 deletions

File: drivers/iommu/amd_iommu.c

@@ -101,6 +101,8 @@ struct kmem_cache *amd_iommu_irq_cache;
static void update_domain(struct protection_domain *domain);
static int protection_domain_init(struct protection_domain *domain);
static void detach_device(struct device *dev);
static void update_and_flush_device_table(struct protection_domain *domain,
struct domain_pgtable *pgtable);
/****************************************************************************
*
@@ -151,6 +153,26 @@ static struct protection_domain *to_pdomain(struct iommu_domain *dom)
return container_of(dom, struct protection_domain, domain);
}
static void amd_iommu_domain_get_pgtable(struct protection_domain *domain,
struct domain_pgtable *pgtable)
{
u64 pt_root = atomic64_read(&domain->pt_root);
pgtable->root = (u64 *)(pt_root & PAGE_MASK);
pgtable->mode = pt_root & 7; /* lowest 3 bits encode pgtable mode */
}
static u64 amd_iommu_domain_encode_pgtable(u64 *root, int mode)
{
u64 pt_root;
/* lowest 3 bits encode pgtable mode */
pt_root = mode & 7;
pt_root |= (u64)root;
return pt_root;
}
static struct iommu_dev_data *alloc_dev_data(u16 devid)
{
struct iommu_dev_data *dev_data;
@@ -1397,13 +1419,18 @@ static struct page *free_sub_pt(unsigned long root, int mode,
static void free_pagetable(struct protection_domain *domain)
{
unsigned long root = (unsigned long)domain->pt_root;
struct domain_pgtable pgtable;
struct page *freelist = NULL;
unsigned long root;
BUG_ON(domain->mode < PAGE_MODE_NONE ||
domain->mode > PAGE_MODE_6_LEVEL);
amd_iommu_domain_get_pgtable(domain, &pgtable);
atomic64_set(&domain->pt_root, 0);
freelist = free_sub_pt(root, domain->mode, freelist);
BUG_ON(pgtable.mode < PAGE_MODE_NONE ||
pgtable.mode > PAGE_MODE_6_LEVEL);
root = (unsigned long)pgtable.root;
freelist = free_sub_pt(root, pgtable.mode, freelist);
free_page_list(freelist);
}
@@ -1417,24 +1444,39 @@ static bool increase_address_space(struct protection_domain *domain,
unsigned long address,
gfp_t gfp)
{
struct domain_pgtable pgtable;
unsigned long flags;
bool ret = false;
u64 *pte;
bool ret = true;
u64 *pte, root;
spin_lock_irqsave(&domain->lock, flags);
if (address <= PM_LEVEL_SIZE(domain->mode) ||
WARN_ON_ONCE(domain->mode == PAGE_MODE_6_LEVEL))
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (address <= PM_LEVEL_SIZE(pgtable.mode))
goto out;
ret = false;
if (WARN_ON_ONCE(pgtable.mode == PAGE_MODE_6_LEVEL))
goto out;
pte = (void *)get_zeroed_page(gfp);
if (!pte)
goto out;
*pte = PM_LEVEL_PDE(domain->mode,
iommu_virt_to_phys(domain->pt_root));
domain->pt_root = pte;
domain->mode += 1;
*pte = PM_LEVEL_PDE(pgtable.mode, iommu_virt_to_phys(pgtable.root));
pgtable.root = pte;
pgtable.mode += 1;
update_and_flush_device_table(domain, &pgtable);
domain_flush_complete(domain);
/*
* Device Table needs to be updated and flushed before the new root can
* be published.
*/
root = amd_iommu_domain_encode_pgtable(pte, pgtable.mode);
atomic64_set(&domain->pt_root, root);
ret = true;
@@ -1451,16 +1493,29 @@ static u64 *alloc_pte(struct protection_domain *domain,
gfp_t gfp,
bool *updated)
{
struct domain_pgtable pgtable;
int level, end_lvl;
u64 *pte, *page;
BUG_ON(!is_power_of_2(page_size));
while (address > PM_LEVEL_SIZE(domain->mode))
*updated = increase_address_space(domain, address, gfp) || *updated;
amd_iommu_domain_get_pgtable(domain, &pgtable);
level = domain->mode - 1;
pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
while (address > PM_LEVEL_SIZE(pgtable.mode)) {
/*
* Return an error if there is no memory to update the
* page-table.
*/
if (!increase_address_space(domain, address, gfp))
return NULL;
/* Read new values to check if update was successful */
amd_iommu_domain_get_pgtable(domain, &pgtable);
}
level = pgtable.mode - 1;
pte = &pgtable.root[PM_LEVEL_INDEX(level, address)];
address = PAGE_SIZE_ALIGN(address, page_size);
end_lvl = PAGE_SIZE_LEVEL(page_size);
@@ -1536,16 +1591,19 @@ static u64 *fetch_pte(struct protection_domain *domain,
unsigned long address,
unsigned long *page_size)
{
struct domain_pgtable pgtable;
int level;
u64 *pte;
*page_size = 0;
if (address > PM_LEVEL_SIZE(domain->mode))
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (address > PM_LEVEL_SIZE(pgtable.mode))
return NULL;
level = domain->mode - 1;
pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
level = pgtable.mode - 1;
pte = &pgtable.root[PM_LEVEL_INDEX(level, address)];
*page_size = PTE_LEVEL_PAGE_SIZE(level);
while (level > 0) {
@@ -1660,7 +1718,13 @@ out:
unsigned long flags;
spin_lock_irqsave(&dom->lock, flags);
update_domain(dom);
/*
* Flush domain TLB(s) and wait for completion. Any Device-Table
* Updates and flushing already happened in
* increase_address_space().
*/
domain_flush_tlb_pde(dom);
domain_flush_complete(dom);
spin_unlock_irqrestore(&dom->lock, flags);
}
@@ -1806,6 +1870,7 @@ static void dma_ops_domain_free(struct protection_domain *domain)
static struct protection_domain *dma_ops_domain_alloc(void)
{
struct protection_domain *domain;
u64 *pt_root, root;
domain = kzalloc(sizeof(struct protection_domain), GFP_KERNEL);
if (!domain)
@@ -1814,12 +1879,14 @@ static struct protection_domain *dma_ops_domain_alloc(void)
if (protection_domain_init(domain))
goto free_domain;
domain->mode = PAGE_MODE_3_LEVEL;
domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
domain->flags = PD_DMA_OPS_MASK;
if (!domain->pt_root)
pt_root = (void *)get_zeroed_page(GFP_KERNEL);
if (!pt_root)
goto free_domain;
root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL);
atomic64_set(&domain->pt_root, root);
domain->flags = PD_DMA_OPS_MASK;
if (iommu_get_dma_cookie(&domain->domain) == -ENOMEM)
goto free_domain;
@@ -1841,16 +1908,17 @@ static bool dma_ops_domain(struct protection_domain *domain)
}
static void set_dte_entry(u16 devid, struct protection_domain *domain,
struct domain_pgtable *pgtable,
bool ats, bool ppr)
{
u64 pte_root = 0;
u64 flags = 0;
u32 old_domid;
if (domain->mode != PAGE_MODE_NONE)
pte_root = iommu_virt_to_phys(domain->pt_root);
if (pgtable->mode != PAGE_MODE_NONE)
pte_root = iommu_virt_to_phys(pgtable->root);
pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
pte_root |= (pgtable->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V | DTE_FLAG_TV;
@@ -1923,6 +1991,7 @@ static void clear_dte_entry(u16 devid)
static void do_attach(struct iommu_dev_data *dev_data,
struct protection_domain *domain)
{
struct domain_pgtable pgtable;
struct amd_iommu *iommu;
bool ats;
@@ -1938,7 +2007,9 @@ static void do_attach(struct iommu_dev_data *dev_data,
domain->dev_cnt += 1;
/* Update device table */
set_dte_entry(dev_data->devid, domain, ats, dev_data->iommu_v2);
amd_iommu_domain_get_pgtable(domain, &pgtable);
set_dte_entry(dev_data->devid, domain, &pgtable,
ats, dev_data->iommu_v2);
clone_aliases(dev_data->pdev);
device_flush_dte(dev_data);
@@ -2249,23 +2320,36 @@ static int amd_iommu_domain_get_attr(struct iommu_domain *domain,
*
*****************************************************************************/
static void update_device_table(struct protection_domain *domain)
static void update_device_table(struct protection_domain *domain,
struct domain_pgtable *pgtable)
{
struct iommu_dev_data *dev_data;
list_for_each_entry(dev_data, &domain->dev_list, list) {
set_dte_entry(dev_data->devid, domain, dev_data->ats.enabled,
dev_data->iommu_v2);
set_dte_entry(dev_data->devid, domain, pgtable,
dev_data->ats.enabled, dev_data->iommu_v2);
clone_aliases(dev_data->pdev);
}
}
static void update_and_flush_device_table(struct protection_domain *domain,
struct domain_pgtable *pgtable)
{
update_device_table(domain, pgtable);
domain_flush_devices(domain);
}
static void update_domain(struct protection_domain *domain)
{
update_device_table(domain);
struct domain_pgtable pgtable;
domain_flush_devices(domain);
/* Update device table */
amd_iommu_domain_get_pgtable(domain, &pgtable);
update_and_flush_device_table(domain, &pgtable);
/* Flush domain TLB(s) and wait for completion */
domain_flush_tlb_pde(domain);
domain_flush_complete(domain);
}
int __init amd_iommu_init_api(void)
@@ -2375,6 +2459,7 @@ out_err:
static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
{
struct protection_domain *pdomain;
u64 *pt_root, root;
switch (type) {
case IOMMU_DOMAIN_UNMANAGED:
@@ -2382,13 +2467,15 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
if (!pdomain)
return NULL;
pdomain->mode = PAGE_MODE_3_LEVEL;
pdomain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
if (!pdomain->pt_root) {
pt_root = (void *)get_zeroed_page(GFP_KERNEL);
if (!pt_root) {
protection_domain_free(pdomain);
return NULL;
}
root = amd_iommu_domain_encode_pgtable(pt_root, PAGE_MODE_3_LEVEL);
atomic64_set(&pdomain->pt_root, root);
pdomain->domain.geometry.aperture_start = 0;
pdomain->domain.geometry.aperture_end = ~0ULL;
pdomain->domain.geometry.force_aperture = true;
@@ -2406,7 +2493,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
if (!pdomain)
return NULL;
pdomain->mode = PAGE_MODE_NONE;
atomic64_set(&pdomain->pt_root, PAGE_MODE_NONE);
break;
default:
return NULL;
@@ -2418,6 +2505,7 @@ static void amd_iommu_domain_free(struct iommu_domain *dom)
static void amd_iommu_domain_free(struct iommu_domain *dom)
{
struct protection_domain *domain;
struct domain_pgtable pgtable;
domain = to_pdomain(dom);
@@ -2435,7 +2523,9 @@ static void amd_iommu_domain_free(struct iommu_domain *dom)
dma_ops_domain_free(domain);
break;
default:
if (domain->mode != PAGE_MODE_NONE)
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (pgtable.mode != PAGE_MODE_NONE)
free_pagetable(domain);
if (domain->flags & PD_IOMMUV2_MASK)
@@ -2518,10 +2608,12 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
gfp_t gfp)
{
struct protection_domain *domain = to_pdomain(dom);
struct domain_pgtable pgtable;
int prot = 0;
int ret;
if (domain->mode == PAGE_MODE_NONE)
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (pgtable.mode == PAGE_MODE_NONE)
return -EINVAL;
if (iommu_prot & IOMMU_READ)
@@ -2541,8 +2633,10 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
struct iommu_iotlb_gather *gather)
{
struct protection_domain *domain = to_pdomain(dom);
struct domain_pgtable pgtable;
if (domain->mode == PAGE_MODE_NONE)
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (pgtable.mode == PAGE_MODE_NONE)
return 0;
return iommu_unmap_page(domain, iova, page_size);
@@ -2553,9 +2647,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
{
struct protection_domain *domain = to_pdomain(dom);
unsigned long offset_mask, pte_pgsize;
struct domain_pgtable pgtable;
u64 *pte, __pte;
if (domain->mode == PAGE_MODE_NONE)
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (pgtable.mode == PAGE_MODE_NONE)
return iova;
pte = fetch_pte(domain, iova, &pte_pgsize);
@@ -2708,16 +2804,26 @@ EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier);
void amd_iommu_domain_direct_map(struct iommu_domain *dom)
{
struct protection_domain *domain = to_pdomain(dom);
struct domain_pgtable pgtable;
unsigned long flags;
u64 pt_root;
spin_lock_irqsave(&domain->lock, flags);
/* First save pgtable configuration*/
amd_iommu_domain_get_pgtable(domain, &pgtable);
/* Update data structure */
domain->mode = PAGE_MODE_NONE;
pt_root = amd_iommu_domain_encode_pgtable(NULL, PAGE_MODE_NONE);
atomic64_set(&domain->pt_root, pt_root);
/* Make changes visible to IOMMUs */
update_domain(domain);
/* Restore old pgtable in domain->ptroot to free page-table */
pt_root = amd_iommu_domain_encode_pgtable(pgtable.root, pgtable.mode);
atomic64_set(&domain->pt_root, pt_root);
/* Page-table is not visible to IOMMU anymore, so free it */
free_pagetable(domain);
@@ -2908,9 +3014,11 @@ static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc)
static int __set_gcr3(struct protection_domain *domain, int pasid,
unsigned long cr3)
{
struct domain_pgtable pgtable;
u64 *pte;
if (domain->mode != PAGE_MODE_NONE)
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (pgtable.mode != PAGE_MODE_NONE)
return -EINVAL;
pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true);
@@ -2924,9 +3032,11 @@ static int __set_gcr3(struct protection_domain *domain, int pasid,
static int __clear_gcr3(struct protection_domain *domain, int pasid)
{
struct domain_pgtable pgtable;
u64 *pte;
if (domain->mode != PAGE_MODE_NONE)
amd_iommu_domain_get_pgtable(domain, &pgtable);
if (pgtable.mode != PAGE_MODE_NONE)
return -EINVAL;
pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false);

File: drivers/iommu/amd_iommu_types.h

@@ -468,8 +468,7 @@ struct protection_domain {
iommu core code */
spinlock_t lock; /* mostly used to lock the page table*/
u16 id; /* the domain id written to the device table */
int mode; /* paging mode (0-6 levels) */
u64 *pt_root; /* page table root pointer */
atomic64_t pt_root; /* pgtable root and pgtable mode */
int glx; /* Number of levels for GCR3 table */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
@@ -477,6 +476,12 @@ struct protection_domain {
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
/* For decoded pt_root */
struct domain_pgtable {
int mode;
u64 *root;
};
/*
* Structure where we save information about one hardware AMD IOMMU in the
* system.

File: drivers/iommu/virtio-iommu.c

@@ -453,7 +453,7 @@ static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
if (!region)
return -ENOMEM;
list_add(&vdev->resv_regions, &region->list);
list_add(&region->list, &vdev->resv_regions);
return 0;
}