drm/amdgpu: Support multiple error query modes
Direct error query mode and firmware error query mode are supported for now.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 07c1db7036
commit 8cc0f5669e
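For orientation before reading the diff: the patch splits RAS error queries into a direct path (read the IP block's RAS registers, or the ECC info for UMC) and a firmware path (ask the SMU's MCA interface for logged errors). The following is a minimal standalone sketch of that dispatch; it is not kernel code, and query_error_status, query_hw_registers and query_firmware_log are illustrative stand-ins for amdgpu_ras_query_error_status_helper and the hooks it calls.

#include <stdio.h>

enum query_mode {
	INVALID_ERROR_QUERY = 0,
	DIRECT_ERROR_QUERY = 1,
	FIRMWARE_ERROR_QUERY = 2,
};

/* Stand-in for the direct path (per-block query_ras_error_count/status hooks). */
static int query_hw_registers(void)
{
	return 0;
}

/* Stand-in for the firmware path (MCA error log queried for UE and CE). */
static int query_firmware_log(void)
{
	return 0;
}

/* Illustrative model of the helper's dispatch on the query mode. */
static int query_error_status(enum query_mode mode)
{
	if (mode == INVALID_ERROR_QUERY)
		return -1;
	if (mode == DIRECT_ERROR_QUERY)
		return query_hw_registers();
	return query_firmware_log();
}

int main(void)
{
	printf("firmware query returned %d\n", query_error_status(FIRMWARE_ERROR_QUERY));
	return 0;
}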
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

@@ -1165,13 +1165,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
 	}
 }
 
 /* query/inject/cure begin */
-int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
-				  struct ras_query_if *info)
+static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
+						struct ras_query_if *info,
+						struct ras_err_data *err_data,
+						unsigned int error_query_mode)
+{
+	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
+	struct amdgpu_ras_block_object *block_obj = NULL;
+
+	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
+		return -EINVAL;
+
+	if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
+		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
+			amdgpu_ras_get_ecc_info(adev, err_data);
+		} else {
+			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
+			if (!block_obj || !block_obj->hw_ops) {
+				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
+					     get_ras_block_str(&info->head));
+				return -EINVAL;
+			}
+
+			if (block_obj->hw_ops->query_ras_error_count)
+				block_obj->hw_ops->query_ras_error_count(adev, &err_data);
+
+			if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
+			    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
+			    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
+				if (block_obj->hw_ops->query_ras_error_status)
+					block_obj->hw_ops->query_ras_error_status(adev);
+			}
+		}
+	} else {
+		/* FIXME: add code to check return value later */
+		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
+		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
+	}
+
+	return 0;
+}
+
+/* query/inject/cure begin */
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
 {
 	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 	struct ras_err_data err_data;
+	unsigned int error_query_mode;
 	int ret;
 
 	if (!obj)
@@ -1181,27 +1221,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
 	if (ret)
 		return ret;
 
-	if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
-		amdgpu_ras_get_ecc_info(adev, &err_data);
-	} else {
-		block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
-		if (!block_obj || !block_obj->hw_ops) {
-			dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
-				     get_ras_block_str(&info->head));
-			ret = -EINVAL;
-			goto out_fini_err_data;
-		}
+	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
+		return -EINVAL;
 
-		if (block_obj->hw_ops->query_ras_error_count)
-			block_obj->hw_ops->query_ras_error_count(adev, &err_data);
-
-		if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
-		    (info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
-		    (info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
-			if (block_obj->hw_ops->query_ras_error_status)
-				block_obj->hw_ops->query_ras_error_status(adev);
-		}
-	}
+	ret = amdgpu_ras_query_error_status_helper(adev, info,
+						   &err_data,
+						   error_query_mode);
+	if (ret)
+		goto out_fini_err_data;
 
 	amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
 
@@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
 	return true;
 }
 
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+				     unsigned int *error_query_mode)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
+
+	if (!con) {
+		*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
+		return false;
+	}
+
+	if (mca_funcs && mca_funcs->mca_set_debug_mode)
+		*error_query_mode =
+			(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
+	else
+		*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
+
+	return true;
+}
+
 /* Register each ip ras block into amdgpu ras */
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 		struct amdgpu_ras_block_object *ras_block_obj)
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

@@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
 	AMDGPU_RAS_PT,
 };
 
+enum amdgpu_ras_error_query_mode {
+	AMDGPU_RAS_INVALID_ERROR_QUERY = 0,
+	AMDGPU_RAS_DIRECT_ERROR_QUERY = 1,
+	AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2,
+};
+
 /* ras error status reisger fields */
 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT	0x0
 #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK	0x00000001L

@@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
 
 void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
 bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
+bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
+				     unsigned int *mode);
 
 int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
 		struct amdgpu_ras_block_object *ras_block_obj);
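As a quick summary of the selection rule added in amdgpu_ras_get_error_query_mode(): the firmware query mode is chosen only when the SMU exposes MCA funcs and MCA debug mode is off; otherwise the direct path is used. Below is a standalone model of that rule; pick_mode and its boolean parameters are illustrative and not part of the driver.

#include <stdbool.h>
#include <stdio.h>

enum ras_error_query_mode {
	RAS_INVALID_ERROR_QUERY = 0,
	RAS_DIRECT_ERROR_QUERY = 1,
	RAS_FIRMWARE_ERROR_QUERY = 2,
};

/* Illustrative model of the mode selection in amdgpu_ras_get_error_query_mode(). */
static enum ras_error_query_mode pick_mode(bool has_ras_context,
					   bool has_mca_smu_funcs,
					   bool mca_debug_mode)
{
	if (!has_ras_context)
		return RAS_INVALID_ERROR_QUERY;

	if (has_mca_smu_funcs)
		return mca_debug_mode ? RAS_DIRECT_ERROR_QUERY
				      : RAS_FIRMWARE_ERROR_QUERY;

	return RAS_DIRECT_ERROR_QUERY;
}

int main(void)
{
	/* MCA-capable SMU with debug mode disabled -> firmware query. */
	printf("mode = %d\n", pick_mode(true, true, false));
	return 0;
}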