drm/amdgpu: add RAS is_rma flag
Set the flag to true if the number of bad pages reaches the threshold.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
15c2990e0f
commit
b95fa494d6
@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data **data;
|
||||
u32 max_eeprom_records_count = 0;
|
||||
bool exc_err_limit = false;
|
||||
int ret;
|
||||
|
||||
if (!con || amdgpu_sriov_vf(adev))
|
||||
@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
*/
|
||||
if (adev->gmc.xgmi.pending_reset)
|
||||
return 0;
|
||||
ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
|
||||
ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
|
||||
/*
|
||||
* This calling fails when exc_err_limit is true or
|
||||
* This calling fails when is_rma is true or
|
||||
* ret != 0.
|
||||
*/
|
||||
if (exc_err_limit || ret)
|
||||
if (con->is_rma || ret)
|
||||
goto free;
|
||||
|
||||
if (con->eeprom_control.ras_num_recs) {
|
||||
@ -3016,7 +3015,7 @@ out:
|
||||
* Except error threshold exceeding case, other failure cases in this
|
||||
* function would not fail amdgpu driver init.
|
||||
*/
|
||||
if (!exc_err_limit)
|
||||
if (!con->is_rma)
|
||||
ret = 0;
|
||||
else
|
||||
ret = -EINVAL;
|
||||
|
@ -522,6 +522,7 @@ struct amdgpu_ras {
|
||||
bool update_channel_flag;
|
||||
/* Record status of smu mca debug mode */
|
||||
bool is_aca_debug_mode;
|
||||
bool is_rma;
|
||||
|
||||
/* Record special requirements of gpu reset caller */
|
||||
uint32_t gpu_reset_flags;
|
||||
|
@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
|
||||
control->tbl_rai.health_percent = 0;
|
||||
}
|
||||
|
||||
if (amdgpu_bad_page_threshold != -1)
|
||||
ras->is_rma = true;
|
||||
|
||||
/* ignore the -ENOTSUPP return value */
|
||||
amdgpu_dpm_send_rma_reason(adev);
|
||||
}
|
||||
@ -1321,8 +1324,7 @@ Out:
|
||||
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
|
||||
}
|
||||
|
||||
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
||||
bool *exceed_err_limit)
|
||||
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
|
||||
{
|
||||
struct amdgpu_device *adev = to_amdgpu_device(control);
|
||||
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
|
||||
@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
int res;
|
||||
|
||||
*exceed_err_limit = false;
|
||||
ras->is_rma = false;
|
||||
|
||||
if (!__is_ras_eeprom_supported(adev))
|
||||
return 0;
|
||||
@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
||||
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
|
||||
res = 0;
|
||||
} else {
|
||||
*exceed_err_limit = true;
|
||||
ras->is_rma = true;
|
||||
dev_err(adev->dev,
|
||||
"RAS records:%d exceed threshold:%d, "
|
||||
"GPU will not be initialized. Replace this GPU or increase the threshold",
|
||||
|
@ -129,8 +129,7 @@ struct eeprom_table_record {
|
||||
unsigned char mcumc_id;
|
||||
} __packed;
|
||||
|
||||
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
|
||||
bool *exceed_err_limit);
|
||||
int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
|
||||
|
||||
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user