From c3811a50addd23b9bb5a36278609ee1638debcf6 Mon Sep 17 00:00:00 2001
From: Wei Huang <wei.huang2@amd.com>
Date: Fri, 20 Aug 2021 15:29:55 -0500
Subject: [PATCH 1/5] iommu/amd: Relocate GAMSup check to early_enable_iommus

Currently, iommu_init_ga() checks and disables IOMMU VAPIC support
(i.e. AMD AVIC support in IOMMU) when GAMSup feature bit is not set.
However it forgets to clear IRQ_POSTING_CAP from the previously set
amd_iommu_irq_ops.capability.

This triggers an invalid page fault bug during guest VM warm reboot
if AVIC is enabled since the irq_remapping_cap(IRQ_POSTING_CAP) is
incorrectly set, and crash the system with the following kernel trace.

    BUG: unable to handle page fault for address: 0000000000400dd8
    RIP: 0010:amd_iommu_deactivate_guest_mode+0x19/0xbc
    Call Trace:
     svm_set_pi_irte_mode+0x8a/0xc0 [kvm_amd]
     ? kvm_make_all_cpus_request_except+0x50/0x70 [kvm]
     kvm_request_apicv_update+0x10c/0x150 [kvm]
     svm_toggle_avic_for_irq_window+0x52/0x90 [kvm_amd]
     svm_enable_irq_window+0x26/0xa0 [kvm_amd]
     vcpu_enter_guest+0xbbe/0x1560 [kvm]
     ? avic_vcpu_load+0xd5/0x120 [kvm_amd]
     ? kvm_arch_vcpu_load+0x76/0x240 [kvm]
     ? svm_get_segment_base+0xa/0x10 [kvm_amd]
     kvm_arch_vcpu_ioctl_run+0x103/0x590 [kvm]
     kvm_vcpu_ioctl+0x22a/0x5d0 [kvm]
     __x64_sys_ioctl+0x84/0xc0
     do_syscall_64+0x33/0x40
     entry_SYSCALL_64_after_hwframe+0x44/0xae

Fixes by moving the initializing of AMD IOMMU interrupt remapping mode
(amd_iommu_guest_ir) earlier before setting up the
amd_iommu_irq_ops.capability with appropriate IRQ_POSTING_CAP flag.

[joro:	Squashed the two patches and limited
	check_features_on_all_iommus() to CONFIG_IRQ_REMAP
	to fix a compile warning.]

Signed-off-by: Wei Huang <wei.huang2@amd.com>
Co-developed-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Link: https://lore.kernel.org/r/20210820202957.187572-2-suravee.suthikulpanit@amd.com
Link: https://lore.kernel.org/r/20210820202957.187572-3-suravee.suthikulpanit@amd.com
Fixes: 8bda0cfbdc1a ("iommu/amd: Detect and initialize guest vAPIC log")
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/init.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index bdcf167b4afe..4e753d1860b3 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -297,6 +297,22 @@ int amd_iommu_get_num_iommus(void)
 	return amd_iommus_present;
 }
 
+#ifdef CONFIG_IRQ_REMAP
+static bool check_feature_on_all_iommus(u64 mask)
+{
+	bool ret = false;
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu) {
+		ret = iommu_feature(iommu, mask);
+		if (!ret)
+			return false;
+	}
+
+	return true;
+}
+#endif
+
 /*
  * For IVHD type 0x11/0x40, EFR is also available via IVHD.
  * Default to IVHD EFR since it is available sooner
@@ -853,13 +869,6 @@ static int iommu_init_ga(struct amd_iommu *iommu)
 	int ret = 0;
 
 #ifdef CONFIG_IRQ_REMAP
-	/* Note: We have already checked GASup from IVRS table.
-	 *       Now, we need to make sure that GAMSup is set.
-	 */
-	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
-	    !iommu_feature(iommu, FEATURE_GAM_VAPIC))
-		amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
-
 	ret = iommu_init_ga_log(iommu);
 #endif /* CONFIG_IRQ_REMAP */
 
@@ -2479,6 +2488,14 @@ static void early_enable_iommus(void)
 	}
 
 #ifdef CONFIG_IRQ_REMAP
+	/*
+	 * Note: We have already checked GASup from IVRS table.
+	 *       Now, we need to make sure that GAMSup is set.
+	 */
+	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) &&
+	    !check_feature_on_all_iommus(FEATURE_GAM_VAPIC))
+		amd_iommu_guest_ir = AMD_IOMMU_GUEST_IR_LEGACY_GA;
+
 	if (AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
 		amd_iommu_irq_ops.capability |= (1 << IRQ_POSTING_CAP);
 #endif

From eb03f2d2f6a4da25d286613717d10add9ce9f175 Mon Sep 17 00:00:00 2001
From: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Date: Fri, 20 Aug 2021 15:29:57 -0500
Subject: [PATCH 2/5] iommu/amd: Remove iommu_init_ga()

Since the function has been simplified and only call iommu_init_ga_log(),
remove the function and replace with iommu_init_ga_log() instead.

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Link: https://lore.kernel.org/r/20210820202957.187572-4-suravee.suthikulpanit@amd.com
Fixes: 8bda0cfbdc1a ("iommu/amd: Detect and initialize guest vAPIC log")
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/amd/init.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c
index 4e753d1860b3..2a822b229bd0 100644
--- a/drivers/iommu/amd/init.c
+++ b/drivers/iommu/amd/init.c
@@ -829,9 +829,9 @@ static int iommu_ga_log_enable(struct amd_iommu *iommu)
 	return 0;
 }
 
-#ifdef CONFIG_IRQ_REMAP
 static int iommu_init_ga_log(struct amd_iommu *iommu)
 {
+#ifdef CONFIG_IRQ_REMAP
 	u64 entry;
 
 	if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))
@@ -861,18 +861,9 @@ static int iommu_init_ga_log(struct amd_iommu *iommu)
 err_out:
 	free_ga_log(iommu);
 	return -EINVAL;
-}
+#else
+	return 0;
 #endif /* CONFIG_IRQ_REMAP */
-
-static int iommu_init_ga(struct amd_iommu *iommu)
-{
-	int ret = 0;
-
-#ifdef CONFIG_IRQ_REMAP
-	ret = iommu_init_ga_log(iommu);
-#endif /* CONFIG_IRQ_REMAP */
-
-	return ret;
 }
 
 static int __init alloc_cwwb_sem(struct amd_iommu *iommu)
@@ -1854,7 +1845,7 @@ static int __init iommu_init_pci(struct amd_iommu *iommu)
 	if (iommu_feature(iommu, FEATURE_PPR) && alloc_ppr_log(iommu))
 		return -ENOMEM;
 
-	ret = iommu_init_ga(iommu);
+	ret = iommu_init_ga_log(iommu);
 	if (ret)
 		return ret;
 

From a21518cb23a3c7d49bafcb59862fd389fd829d4b Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Sat, 28 Aug 2021 15:06:21 +0800
Subject: [PATCH 3/5] iommu/vt-d: Fix PASID leak in intel_svm_unbind_mm()

The mm->pasid will be used in intel_svm_free_pasid() after load_pasid()
during unbinding mm. Clearing it in load_pasid() will cause PASID cannot
be freed in intel_svm_free_pasid().

Additionally mm->pasid was updated already before load_pasid() during pasid
allocation. No need to update it again in load_pasid() during binding mm.
Don't update mm->pasid to avoid the issues in both binding mm and unbinding
mm.

Fixes: 4048377414162 ("iommu/vt-d: Use iommu_sva_alloc(free)_pasid() helpers")
Reported-and-tested-by: Dave Jiang <dave.jiang@intel.com>
Co-developed-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20210826215918.4073446-1-fenghua.yu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210828070622.2437559-2-baolu.lu@linux.intel.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/svm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index 2014fe8695ac..a3f84f2ff1ba 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -514,9 +514,6 @@ static void load_pasid(struct mm_struct *mm, u32 pasid)
 {
 	mutex_lock(&mm->context.lock);
 
-	/* Synchronize with READ_ONCE in update_pasid(). */
-	smp_store_release(&mm->pasid, pasid);
-
 	/* Update PASID MSR on all CPUs running the mm's tasks. */
 	on_each_cpu_mask(mm_cpumask(mm), _load_pasid, NULL, true);
 

From 6ef0505158f7ca1b32763a3b038b5d11296b642b Mon Sep 17 00:00:00 2001
From: Fenghua Yu <fenghua.yu@intel.com>
Date: Sat, 28 Aug 2021 15:06:22 +0800
Subject: [PATCH 4/5] iommu/vt-d: Fix a deadlock in intel_svm_drain_prq()

pasid_mutex and dev->iommu->param->lock are held while unbinding mm is
flushing IO page fault workqueue and waiting for all page fault works to
finish. But an in-flight page fault work also need to hold the two locks
while unbinding mm are holding them and waiting for the work to finish.
This may cause an ABBA deadlock issue as shown below:

	idxd 0000:00:0a.0: unbind PASID 2
	======================================================
	WARNING: possible circular locking dependency detected
	5.14.0-rc7+ #549 Not tainted [  186.615245] ----------
	dsa_test/898 is trying to acquire lock:
	ffff888100d854e8 (&param->lock){+.+.}-{3:3}, at:
	iopf_queue_flush_dev+0x29/0x60
	but task is already holding lock:
	ffffffff82b2f7c8 (pasid_mutex){+.+.}-{3:3}, at:
	intel_svm_unbind+0x34/0x1e0
	which lock already depends on the new lock.

	the existing dependency chain (in reverse order) is:

	-> #2 (pasid_mutex){+.+.}-{3:3}:
	       __mutex_lock+0x75/0x730
	       mutex_lock_nested+0x1b/0x20
	       intel_svm_page_response+0x8e/0x260
	       iommu_page_response+0x122/0x200
	       iopf_handle_group+0x1c2/0x240
	       process_one_work+0x2a5/0x5a0
	       worker_thread+0x55/0x400
	       kthread+0x13b/0x160
	       ret_from_fork+0x22/0x30

	-> #1 (&param->fault_param->lock){+.+.}-{3:3}:
	       __mutex_lock+0x75/0x730
	       mutex_lock_nested+0x1b/0x20
	       iommu_report_device_fault+0xc2/0x170
	       prq_event_thread+0x28a/0x580
	       irq_thread_fn+0x28/0x60
	       irq_thread+0xcf/0x180
	       kthread+0x13b/0x160
	       ret_from_fork+0x22/0x30

	-> #0 (&param->lock){+.+.}-{3:3}:
	       __lock_acquire+0x1134/0x1d60
	       lock_acquire+0xc6/0x2e0
	       __mutex_lock+0x75/0x730
	       mutex_lock_nested+0x1b/0x20
	       iopf_queue_flush_dev+0x29/0x60
	       intel_svm_drain_prq+0x127/0x210
	       intel_svm_unbind+0xc5/0x1e0
	       iommu_sva_unbind_device+0x62/0x80
	       idxd_cdev_release+0x15a/0x200 [idxd]
	       __fput+0x9c/0x250
	       ____fput+0xe/0x10
	       task_work_run+0x64/0xa0
	       exit_to_user_mode_prepare+0x227/0x230
	       syscall_exit_to_user_mode+0x2c/0x60
	       do_syscall_64+0x48/0x90
	       entry_SYSCALL_64_after_hwframe+0x44/0xae

	other info that might help us debug this:

	Chain exists of:
	  &param->lock --> &param->fault_param->lock --> pasid_mutex

	 Possible unsafe locking scenario:

	       CPU0                    CPU1
	       ----                    ----
	  lock(pasid_mutex);
				       lock(&param->fault_param->lock);
				       lock(pasid_mutex);
	  lock(&param->lock);

	 *** DEADLOCK ***

	2 locks held by dsa_test/898:
	 #0: ffff888100cc1cc0 (&group->mutex){+.+.}-{3:3}, at:
	 iommu_sva_unbind_device+0x53/0x80
	 #1: ffffffff82b2f7c8 (pasid_mutex){+.+.}-{3:3}, at:
	 intel_svm_unbind+0x34/0x1e0

	stack backtrace:
	CPU: 2 PID: 898 Comm: dsa_test Not tainted 5.14.0-rc7+ #549
	Hardware name: Intel Corporation Kabylake Client platform/KBL S
	DDR4 UD IMM CRB, BIOS KBLSE2R1.R00.X050.P01.1608011715 08/01/2016
	Call Trace:
	 dump_stack_lvl+0x5b/0x74
	 dump_stack+0x10/0x12
	 print_circular_bug.cold+0x13d/0x142
	 check_noncircular+0xf1/0x110
	 __lock_acquire+0x1134/0x1d60
	 lock_acquire+0xc6/0x2e0
	 ? iopf_queue_flush_dev+0x29/0x60
	 ? pci_mmcfg_read+0xde/0x240
	 __mutex_lock+0x75/0x730
	 ? iopf_queue_flush_dev+0x29/0x60
	 ? pci_mmcfg_read+0xfd/0x240
	 ? iopf_queue_flush_dev+0x29/0x60
	 mutex_lock_nested+0x1b/0x20
	 iopf_queue_flush_dev+0x29/0x60
	 intel_svm_drain_prq+0x127/0x210
	 ? intel_pasid_tear_down_entry+0x22e/0x240
	 intel_svm_unbind+0xc5/0x1e0
	 iommu_sva_unbind_device+0x62/0x80
	 idxd_cdev_release+0x15a/0x200

pasid_mutex protects pasid and svm data mapping data. It's unnecessary
to hold pasid_mutex while flushing the workqueue. To fix the deadlock
issue, unlock pasid_pasid during flushing the workqueue to allow the works
to be handled.

Fixes: d5b9e4bfe0d8 ("iommu/vt-d: Report prq to io-pgfault framework")
Reported-and-tested-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Link: https://lore.kernel.org/r/20210826215918.4073446-1-fenghua.yu@intel.com
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Link: https://lore.kernel.org/r/20210828070622.2437559-3-baolu.lu@linux.intel.com
[joro: Removed timing information from kernel log messages]
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/intel/svm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
index a3f84f2ff1ba..0c228787704f 100644
--- a/drivers/iommu/intel/svm.c
+++ b/drivers/iommu/intel/svm.c
@@ -789,7 +789,19 @@ static void intel_svm_drain_prq(struct device *dev, u32 pasid)
 		goto prq_retry;
 	}
 
+	/*
+	 * A work in IO page fault workqueue may try to lock pasid_mutex now.
+	 * Holding pasid_mutex while waiting in iopf_queue_flush_dev() for
+	 * all works in the workqueue to finish may cause deadlock.
+	 *
+	 * It's unnecessary to hold pasid_mutex in iopf_queue_flush_dev().
+	 * Unlock it to allow the works to be handled while waiting for
+	 * them to finish.
+	 */
+	lockdep_assert_held(&pasid_mutex);
+	mutex_unlock(&pasid_mutex);
 	iopf_queue_flush_dev(dev);
+	mutex_lock(&pasid_mutex);
 
 	/*
 	 * Perform steps described in VT-d spec CH7.10 to drain page

From 8cc633190b524c678b740c87fa1fc37447151a6b Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 8 Sep 2021 13:55:37 +0100
Subject: [PATCH 5/5] iommu: Clarify default domain Kconfig

Although strictly it is the AMD and Intel drivers which have an existing
expectation of lazy behaviour by default, it ends up being rather
unintuitive to describe this literally in Kconfig. Express it instead as
an architecture dependency, to clarify that it is a valid config-time
decision. The end result is the same since virtio-iommu doesn't support
lazy mode and thus falls back to strict at runtime regardless.

The per-architecture disparity is a matter of historical expectations:
the AMD and Intel drivers have been lazy by default since 2008, and
changing that gets noticed by people asking where their I/O throughput
has gone. Conversely, Arm-based systems with their wider assortment of
IOMMU drivers mostly only support strict mode anyway; only the Arm SMMU
drivers have later grown support for passthrough and lazy mode, for
users who wanted to explicitly trade off isolation for performance.
These days, reducing the default level of isolation in a way which may
go unnoticed by users who expect otherwise hardly seems worth risking
for the sake of one line of Kconfig, so here's where we are.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/69a0c6f17b000b54b8333ee42b3124c1d5a869e2.1631105737.git.robin.murphy@arm.com
Signed-off-by: Joerg Roedel <jroedel@suse.de>
---
 drivers/iommu/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f14f2e401073..4aa626cf00d8 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -82,7 +82,7 @@ config IOMMU_DEBUGFS
 choice
 	prompt "IOMMU default domain type"
 	depends on IOMMU_API
-	default IOMMU_DEFAULT_DMA_LAZY if AMD_IOMMU || INTEL_IOMMU
+	default IOMMU_DEFAULT_DMA_LAZY if X86 || IA64
 	default IOMMU_DEFAULT_DMA_STRICT
 	help
 	  Choose the type of IOMMU domain used to manage DMA API usage by