mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-09-27 12:57:53 +00:00
dmr/amdgpu: Add system auto reboot to RAS.
In case of RAS error allow user configure auto system reboot through ras_ctrl. This is also part of the temproray work around for the RAS hang problem. v4: Use latest kernel API for disk sync. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
7c6e68c777
commit
d5ea093eeb
3 changed files with 23 additions and 2 deletions
|
@ -65,6 +65,8 @@
|
|||
#include "amdgpu_ras.h"
|
||||
#include "amdgpu_pmu.h"
|
||||
|
||||
#include <linux/suspend.h>
|
||||
|
||||
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
|
||||
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
|
||||
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
|
||||
|
@ -3769,6 +3771,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
|||
int i, r = 0;
|
||||
bool in_ras_intr = amdgpu_ras_intr_triggered();
|
||||
|
||||
/*
|
||||
* Flush RAM to disk so that after reboot
|
||||
* the user can read log and see why the system rebooted.
|
||||
*/
|
||||
if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
|
||||
|
||||
DRM_WARN("Emergency reboot.");
|
||||
|
||||
ksys_sync_helper();
|
||||
emergency_restart();
|
||||
}
|
||||
|
||||
need_full_reset = job_signaled = false;
|
||||
INIT_LIST_HEAD(&device_list);
|
||||
|
||||
|
|
|
@ -156,6 +156,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
|
|||
op = 1;
|
||||
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
|
||||
op = 2;
|
||||
else if (sscanf(str, "reboot %32s", block_name) == 1)
|
||||
op = 3;
|
||||
else if (str[0] && str[1] && str[2] && str[3])
|
||||
/* ascii string, but commands are not matched. */
|
||||
return -EINVAL;
|
||||
|
@ -289,6 +291,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
|
|||
/* data.inject.address is offset instead of absolute gpu address */
|
||||
ret = amdgpu_ras_error_inject(adev, &data.inject);
|
||||
break;
|
||||
case 3:
|
||||
amdgpu_ras_get_context(adev)->reboot = true;
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
|
@ -1746,6 +1751,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
|
|||
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
|
||||
{
|
||||
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
|
||||
DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
|
||||
DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
|
||||
|
||||
amdgpu_ras_reset_gpu(adev, false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -334,7 +334,7 @@ struct amdgpu_ras {
|
|||
struct mutex recovery_lock;
|
||||
|
||||
uint32_t flags;
|
||||
|
||||
bool reboot;
|
||||
struct amdgpu_ras_eeprom_control eeprom_control;
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in a new issue