drm/amdgpu: add RAS is_rma flag

Set the flag to true if bad page number reaches threshold. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2024-05-23 11:23:20 +08:00 · 2024-05-23 11:23:20 +08:00 · b95fa494d6
commit b95fa494d6
parent 15c2990e0f
4 changed files with 12 additions and 11 deletions
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data;
 	u32  max_eeprom_records_count = 0;
-	bool exc_err_limit = false;
 	int ret;

 	if (!con || amdgpu_sriov_vf(adev))
@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	 */
 	if (adev->gmc.xgmi.pending_reset)
 		return 0;
-	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
 	/*
-	 * This calling fails when exc_err_limit is true or
+	 * This calling fails when is_rma is true or
 	 * ret != 0.
 	 */
-	if (exc_err_limit || ret)
+	if (con->is_rma || ret)
 		goto free;

 	if (con->eeprom_control.ras_num_recs) {
@ -3016,7 +3015,7 @@ out:
 	 * Except error threshold exceeding case, other failure cases in this
 	 * function would not fail amdgpu driver init.
 	 */
-	if (!exc_err_limit)
+	if (!con->is_rma)
 		ret = 0;
 	else
 		ret = -EINVAL;
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@ -522,6 +522,7 @@ struct amdgpu_ras {
 	bool update_channel_flag;
 	/* Record status of smu mca debug mode */
 	bool is_aca_debug_mode;
+	bool is_rma;

 	/* Record special requirements of gpu reset caller */
 	uint32_t  gpu_reset_flags;
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 			control->tbl_rai.health_percent = 0;
 		}

+		if (amdgpu_bad_page_threshold != -1)
+			ras->is_rma = true;
+
 		/* ignore the -ENOTSUPP return value */
 		amdgpu_dpm_send_rma_reason(adev);
 	}
@ -1321,8 +1324,7 @@ Out:
 	return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }

-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-			   bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 {
 	struct amdgpu_device *adev = to_amdgpu_device(control);
 	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 	int res;

-	*exceed_err_limit = false;
+	ras->is_rma = false;

 	if (!__is_ras_eeprom_supported(adev))
 		return 0;
@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 				dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
 				res = 0;
 			} else {
-				*exceed_err_limit = true;
+				ras->is_rma = true;
 				dev_err(adev->dev,
 					"RAS records:%d exceed threshold:%d, "
 					"GPU will not be initialized. Replace this GPU or increase the threshold",
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@ -129,8 +129,7 @@ struct eeprom_table_record {
 	unsigned char mcumc_id;
 } __packed;

-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
-			   bool *exceed_err_limit);
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);

 int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);