drm/amdkfd: Fix retry fault drain race conditions

The check for whether to drain retry faults must be under the mmap write
lock to serialize with munmap notifier callbacks.

We were also missing checks on child ranges. To fix that, simplify the
logic by using a flag rather than checking on each prange. That also
allows draining less frequently when many ranges are unmapped at once.
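
For illustration, a reduced sketch of that flag mechanism follows; it is not
the driver code. Only the drain_pagefaults flag, the lock names and the
"set on unmap, check and drain in the worker" flow mirror the patch, while
struct svms_sketch, mark_unmap_queued(), deferred_worker() and
drain_retry_faults() are made-up stand-ins.

#include <linux/mmap_lock.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct svms_sketch {
	spinlock_t deferred_list_lock;
	struct mutex lock;
	bool drain_pagefaults;
};

/* Stand-in for svm_range_drain_retry_fault(): in the driver this flushes
 * pending fault handling so in-flight retry faults on unmapped ranges are
 * processed before those ranges are freed.
 */
static void drain_retry_faults(struct svms_sketch *svms)
{
}

/* Unmap path: queuing SVM_OP_UNMAP_RANGE work only sets the flag, so a
 * burst of unmaps results in a single drain in the worker below.
 */
static void mark_unmap_queued(struct svms_sketch *svms)
{
	spin_lock(&svms->deferred_list_lock);
	svms->drain_pagefaults = true;
	spin_unlock(&svms->deferred_list_lock);
}

/* Deferred worker: the flag is checked under the mmap write lock so it
 * serializes with munmap notifiers. All locks are dropped before draining,
 * then the whole sequence is retried from the top.
 */
static void deferred_worker(struct svms_sketch *svms, struct mm_struct *mm)
{
retry:
	mmap_write_lock(mm);
	mutex_lock(&svms->lock);
	spin_lock(&svms->deferred_list_lock);
	if (svms->drain_pagefaults) {
		svms->drain_pagefaults = false;
		spin_unlock(&svms->deferred_list_lock);
		mutex_unlock(&svms->lock);
		mmap_write_unlock(mm);
		drain_retry_faults(svms);
		goto retry;
	}
	/* ... process the deferred range list under the locks ... */
	spin_unlock(&svms->deferred_list_lock);
	mutex_unlock(&svms->lock);
	mmap_write_unlock(mm);
}

Because a burst of unmaps only sets the flag, the worker drains once per
batch, and checking the flag under the mmap write lock means it cannot race
with a munmap notifier that is still queuing unmap work.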

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Tested-by: Philip Yang <Philip.Yang@amd.com>
Tested-by: Alex Sierra <Alex.Sierra@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drivers/gpu/drm/amd/amdkfd/kfd_priv.h

@@ -766,6 +766,7 @@ struct svm_range_list {
 	struct list_head		deferred_range_list;
 	spinlock_t			deferred_list_lock;
 	atomic_t			evicted_ranges;
+	bool				drain_pagefaults;
 	struct delayed_work		restore_work;
 	DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
 	struct task_struct		*faulting_task;

drivers/gpu/drm/amd/amdkfd/kfd_svm.c

@@ -2002,20 +2002,28 @@ static void svm_range_deferred_list_work(struct work_struct *work)
 		pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n", prange,
 			 prange->start, prange->last, prange->work_item.op);
-		/* Make sure no stale retry fault coming after range is freed */
-		if (prange->work_item.op == SVM_OP_UNMAP_RANGE)
-			svm_range_drain_retry_fault(prange->svms);
 		mm = prange->work_item.mm;
+retry:
 		mmap_write_lock(mm);
 		mutex_lock(&svms->lock);
-		/* Remove from deferred_list must be inside mmap write lock,
+		/* Checking for the need to drain retry faults must be in
+		 * mmap write lock to serialize with munmap notifiers.
+		 *
+		 * Remove from deferred_list must be inside mmap write lock,
 		 * otherwise, svm_range_list_lock_and_flush_work may hold mmap
 		 * write lock, and continue because deferred_list is empty, then
 		 * deferred_list handle is blocked by mmap write lock.
 		 */
 		spin_lock(&svms->deferred_list_lock);
+		if (unlikely(svms->drain_pagefaults)) {
+			svms->drain_pagefaults = false;
+			spin_unlock(&svms->deferred_list_lock);
+			mutex_unlock(&svms->lock);
+			mmap_write_unlock(mm);
+			svm_range_drain_retry_fault(svms);
+			goto retry;
+		}
 		list_del_init(&prange->deferred_list);
 		spin_unlock(&svms->deferred_list_lock);
@@ -2048,6 +2056,12 @@ svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange,
 			struct mm_struct *mm, enum svm_work_list_ops op)
 {
 	spin_lock(&svms->deferred_list_lock);
+	/* Make sure pending page faults are drained in the deferred worker
+	 * before the range is freed to avoid straggler interrupts on
+	 * unmapped memory causing "phantom faults".
+	 */
+	if (op == SVM_OP_UNMAP_RANGE)
+		svms->drain_pagefaults = true;
 	/* if prange is on the deferred list */
 	if (!list_empty(&prange->deferred_list)) {
 		pr_debug("update exist prange 0x%p work op %d\n", prange, op);