drm/amdgpu: Adjust removal control flow for smu v13_0_2

Adjust removal control flow for smu v13_0_2:
   During amdgpu uninstallation, when removing the first
device, the kernel needs to first send a mode1reset message
to all gpu devices. Otherwise, smu initialization will fail
the next time amdgpu is installed.

V2:
1. Update commit comments.
2. Remove the global variable amdgpu_device_remove_cnt
   and add a variable to the structure amdgpu_hive_info.
3. Use hive to detect the first removed device instead of
   a global variable.

V3:
 1. Update commit comments.
 2. Split a patch into multiple patches.
 3. The current patch does:
    a. Add a work mode of AMDGPU_RESET_FOR_DEVICE_REMOVE into
       the existing gpu recover path, which make all devices
       in hive list only have HW reset but no resume (except
       the base IP).
    b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and
       AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover
       in amdgpu_pci_remove when removing the first device in
       hive list.
    c. When removing the first device, the IP blocks keyword
       function call sequence is as follows:
.suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini.
   ^                           |
   |-<----------<---------<----|
	The first three sequences are because of a call to
        amdgpu_device_gpu_recover. The three sequences will be
        executed in a loop until all devices in the hive list
        are iterated.
        The sequences starting from .hw_fini only apply to the
        first device. Since .suspend has been called before,
        except the resumed phase1 basic ip blocks, all other ip
        blocks .hw_fini of current device will do nothing.
     d. When removing other devices, the calling sequences is the
        same as legacy:
	   .hw_fini -> .early_fini -> .sw_fini.
	Since .suspend has been called when removing the first device,
        except the resumed phase1 basic ip blocks, all of other ip
        blocks .hw_fini of current device will do nothing.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
YiPeng Chai 2022-09-07 16:07:42 +08:00 committed by Alex Deucher
parent e4cf73fdfa
commit f5c7e77970
4 changed files with 62 additions and 0 deletions

View File

@ -4749,6 +4749,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
bool gpu_reset_for_dev_remove = 0;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
@ -4768,6 +4769,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
gpu_reset_for_dev_remove =
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
@ -4812,6 +4817,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
amdgpu_ras_intr_cleared();
}
/* Since the mode1 reset affects base ip blocks, the
* phase1 ip blocks need to be resumed. Otherwise there
* will be a BIOS signature error and the psp bootloader
* can't load kdb on the next amdgpu install.
*/
if (gpu_reset_for_dev_remove) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
amdgpu_device_ip_resume_phase1(tmp_adev);
goto end;
}
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* post card */
@ -5134,6 +5151,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
bool need_emergency_restart = false;
bool audio_suspended = false;
int tmp_vram_lost_counter;
bool gpu_reset_for_dev_remove = false;
gpu_reset_for_dev_remove =
test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
/*
* Special case: RAS triggered and full reset isn't supported
@ -5253,6 +5275,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (gpu_reset_for_dev_remove) {
/* Workaroud for ASICs need to disable SMC first */
amdgpu_device_smu_fini_early(tmp_adev);
}
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
/*TODO Should we stop ?*/
if (r) {
@ -5286,6 +5312,9 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
adev->asic_reset_res = 0;
goto retry;
}
if (!r && gpu_reset_for_dev_remove)
goto recover_end;
}
skip_hw_reset:
@ -5359,6 +5388,7 @@ skip_sched_resume:
amdgpu_device_unset_mp1_state(tmp_adev);
}
recover_end:
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);

View File

@ -2186,6 +2186,36 @@ amdgpu_pci_remove(struct pci_dev *pdev)
pm_runtime_forbid(dev->dev);
}
if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2)) {
bool need_to_reset_gpu = false;
if (adev->gmc.xgmi.num_physical_nodes > 1) {
struct amdgpu_hive_info *hive;
hive = amdgpu_get_xgmi_hive(adev);
if (hive->device_remove_count == 0)
need_to_reset_gpu = true;
hive->device_remove_count++;
amdgpu_put_xgmi_hive(hive);
} else {
need_to_reset_gpu = true;
}
/* Workaround for ASICs need to reset SMU.
* Called only when the first device is removed.
*/
if (need_to_reset_gpu) {
struct amdgpu_reset_context reset_context;
memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}
}
amdgpu_driver_unload_kms(dev);
drm_dev_unplug(dev);

View File

@ -31,6 +31,7 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_SKIP_MODE2_RESET = 2,
AMDGPU_RESET_FOR_DEVICE_REMOVE = 3,
};
struct amdgpu_reset_context {

View File

@ -43,6 +43,7 @@ struct amdgpu_hive_info {
} pstate;
struct amdgpu_reset_domain *reset_domain;
uint32_t device_remove_count;
};
struct amdgpu_pcs_ras_field {