mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-29 22:02:02 +00:00
drm/amdgpu: reset VM when an error is detected
When a problem with the page table updates is detected, reset the state machine of the VM and re-create all page tables from scratch.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
e84e697d92
commit
55bf196f60
1 changed file with 65 additions and 16 deletions
|
@ -266,6 +266,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
|
||||||
spin_unlock(&vm_bo->vm->status_lock);
|
spin_unlock(&vm_bo->vm->status_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
|
||||||
|
* @vm: the VM which state machine to reset
|
||||||
|
*
|
||||||
|
* Move all vm_bo object in the VM into a state where they will be updated
|
||||||
|
* again during validation.
|
||||||
|
*/
|
||||||
|
static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
|
||||||
|
{
|
||||||
|
struct amdgpu_vm_bo_base *vm_bo, *tmp;
|
||||||
|
|
||||||
|
spin_lock(&vm->status_lock);
|
||||||
|
list_splice_init(&vm->done, &vm->invalidated);
|
||||||
|
list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
|
||||||
|
vm_bo->moved = true;
|
||||||
|
list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
|
||||||
|
struct amdgpu_bo *bo = vm_bo->bo;
|
||||||
|
|
||||||
|
if (!bo || bo->tbo.type != ttm_bo_type_kernel)
|
||||||
|
list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
|
||||||
|
else if (bo->parent)
|
||||||
|
list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
|
||||||
|
}
|
||||||
|
spin_unlock(&vm->status_lock);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
|
* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
|
||||||
*
|
*
|
||||||
|
@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
|
||||||
spin_unlock(&adev->mman.bdev.lru_lock);
|
spin_unlock(&adev->mman.bdev.lru_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Create scheduler entities for page table updates */
|
||||||
|
static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
|
||||||
|
struct amdgpu_vm *vm)
|
||||||
|
{
|
||||||
|
int r;
|
||||||
|
|
||||||
|
r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
|
||||||
|
adev->vm_manager.vm_pte_scheds,
|
||||||
|
adev->vm_manager.vm_pte_num_scheds, NULL);
|
||||||
|
if (r)
|
||||||
|
goto error;
|
||||||
|
|
||||||
|
return drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
|
||||||
|
adev->vm_manager.vm_pte_scheds,
|
||||||
|
adev->vm_manager.vm_pte_num_scheds, NULL);
|
||||||
|
|
||||||
|
error:
|
||||||
|
drm_sched_entity_destroy(&vm->immediate);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Destroy the entities for page table updates again */
|
||||||
|
static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
|
||||||
|
{
|
||||||
|
drm_sched_entity_destroy(&vm->immediate);
|
||||||
|
drm_sched_entity_destroy(&vm->delayed);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* amdgpu_vm_validate_pt_bos - validate the page table BOs
|
* amdgpu_vm_validate_pt_bos - validate the page table BOs
|
||||||
*
|
*
|
||||||
|
@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
|
||||||
struct amdgpu_bo *bo;
|
struct amdgpu_bo *bo;
|
||||||
int r;
|
int r;
|
||||||
|
|
||||||
|
if (drm_sched_entity_error(&vm->delayed)) {
|
||||||
|
amdgpu_vm_bo_reset_state_machine(vm);
|
||||||
|
amdgpu_vm_fini_entities(vm);
|
||||||
|
r = amdgpu_vm_init_entities(adev, vm);
|
||||||
|
if (r)
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
spin_lock(&vm->status_lock);
|
spin_lock(&vm->status_lock);
|
||||||
while (!list_empty(&vm->evicted)) {
|
while (!list_empty(&vm->evicted)) {
|
||||||
bo_base = list_first_entry(&vm->evicted,
|
bo_base = list_first_entry(&vm->evicted,
|
||||||
|
@ -2048,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
|
||||||
INIT_LIST_HEAD(&vm->pt_freed);
|
INIT_LIST_HEAD(&vm->pt_freed);
|
||||||
INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
|
INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
|
||||||
|
|
||||||
/* create scheduler entities for page table updates */
|
r = amdgpu_vm_init_entities(adev, vm);
|
||||||
r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
|
|
||||||
adev->vm_manager.vm_pte_scheds,
|
|
||||||
adev->vm_manager.vm_pte_num_scheds, NULL);
|
|
||||||
if (r)
|
if (r)
|
||||||
return r;
|
return r;
|
||||||
|
|
||||||
r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
|
|
||||||
adev->vm_manager.vm_pte_scheds,
|
|
||||||
adev->vm_manager.vm_pte_num_scheds, NULL);
|
|
||||||
if (r)
|
|
||||||
goto error_free_immediate;
|
|
||||||
|
|
||||||
vm->pte_support_ats = false;
|
vm->pte_support_ats = false;
|
||||||
vm->is_compute_context = false;
|
vm->is_compute_context = false;
|
||||||
|
|
||||||
|
@ -2121,10 +2174,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
|
||||||
error_free_delayed:
|
error_free_delayed:
|
||||||
dma_fence_put(vm->last_tlb_flush);
|
dma_fence_put(vm->last_tlb_flush);
|
||||||
dma_fence_put(vm->last_unlocked);
|
dma_fence_put(vm->last_unlocked);
|
||||||
drm_sched_entity_destroy(&vm->delayed);
|
amdgpu_vm_fini_entities(vm);
|
||||||
|
|
||||||
error_free_immediate:
|
|
||||||
drm_sched_entity_destroy(&vm->immediate);
|
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
@ -2277,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
|
||||||
amdgpu_bo_unref(&root);
|
amdgpu_bo_unref(&root);
|
||||||
WARN_ON(vm->root.bo);
|
WARN_ON(vm->root.bo);
|
||||||
|
|
||||||
drm_sched_entity_destroy(&vm->immediate);
|
amdgpu_vm_fini_entities(vm);
|
||||||
drm_sched_entity_destroy(&vm->delayed);
|
|
||||||
|
|
||||||
if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
|
if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
|
||||||
dev_err(adev->dev, "still active bo inside vm\n");
|
dev_err(adev->dev, "still active bo inside vm\n");
|
||||||
|
|
Loading…
Reference in a new issue