drm/amdgpu: break GPU recovery once it's in bad state(v4)

When GPU executes recovery and retriving bad GPU tag from external eerpom device, the recovery will be broken and error message is printed as well for user's awareness. v2: Refine warning message in threshold reaching case, and fix spelling typo. v3: Fix explicit calling of bad gpu. v4: Rename function names. Signed-off-by: Guchun Chen <guchun.chen@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2024-09-27 12:57:53 +00:00 · 2020-07-23 16:20:02 +08:00 · 2020-07-23 16:20:02 +08:00 · e8fbaf0342
commit e8fbaf0342
parent 9c06f91ff2
5 changed files with 79 additions and 3 deletions
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@ -4125,8 +4125,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

 				amdgpu_fbdev_set_suspend(tmp_adev, 0);

-				/* must succeed. */
-				amdgpu_ras_resume(tmp_adev);
+				/*
+				 * The GPU enters bad state once faulty pages
+				 * by ECC has reached the threshold, and ras
+				 * recovery is scheduled next. So add one check
+				 * here to break recovery if it indeed exceeds
+				 * bad page threshold, and remind user to
+				 * retire this GPU or setting one bigger
+				 * bad_page_threshold value to fix this once
+				 * probing driver again.
+				 */
+				if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
+					/* must succeed. */
+					amdgpu_ras_resume(tmp_adev);
+				} else {
+					r = -EINVAL;
+					goto out;
+				}

 				/* Update PSP FW topology after reset */
 				if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
@ -4134,7 +4149,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 			}
 		}

-
 out:
 		if (!r) {
 			amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@ -2205,3 +2205,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)

 	return false;
 }
+
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	bool exc_err_limit = false;
+
+	if (con && (amdgpu_bad_page_threshold != 0))
+		amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
+						&exc_err_limit);
+
+	/*
+	 * We are only interested in variable exc_err_limit,
+	 * as it says if GPU is in bad state or not.
+	 */
+	return exc_err_limit;
+}
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev);
 unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 		bool is_ce);

+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev);
+
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 		struct eeprom_table_record *bps, int pages);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address)
 	return curr_address;
 }

+int amdgpu_ras_eeprom_check_err_threshold(
+				struct amdgpu_ras_eeprom_control *control,
+				bool *exceed_err_limit)
+{
+	struct amdgpu_device *adev = to_amdgpu_device(control);
+	unsigned char buff[EEPROM_ADDRESS_SIZE +
+			EEPROM_TABLE_HEADER_SIZE] = { 0 };
+	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+	struct i2c_msg msg = {
+			.addr = control->i2c_address,
+			.flags = I2C_M_RD,
+			.len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE,
+			.buf = buff,
+	};
+	int ret;
+
+	*exceed_err_limit = false;
+
+	/* read EEPROM table header */
+	mutex_lock(&control->tbl_mutex);
+	ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1);
+	if (ret < 1) {
+		dev_err(adev->dev, "Failed to read EEPROM table header.\n");
+		goto err;
+	}
+
+	__decode_table_header_from_buff(hdr, &buff[2]);
+
+	if (hdr->header == EEPROM_TABLE_HDR_BAD) {
+		dev_warn(adev->dev, "This GPU is in BAD status.");
+		dev_warn(adev->dev, "Please retire it or setting one bigger "
+				"threshold value when reloading driver.\n");
+		*exceed_err_limit = true;
+	}
+
+err:
+	mutex_unlock(&control->tbl_mutex);
+	return 0;
+}
+
 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 					    struct eeprom_table_record *records,
 					    bool write,
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@ -80,6 +80,10 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 			bool *exceed_err_limit);
 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);

+int amdgpu_ras_eeprom_check_err_threshold(
+				struct amdgpu_ras_eeprom_control *control,
+				bool *exceed_err_limit);
+
 int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control,
 					    struct eeprom_table_record *records,
 					    bool write,