drm/amdgpu: add pcs xgmi v6.4.0 ras support
add pcs xgmi v6.4.0 ras support

Signed-off-by: Yang Wang <kevinyang.wang@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
4abf0b0bdf
commit
27d80f7d68
@ -113,6 +113,43 @@ static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
|
||||
smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
|
||||
};
|
||||
|
||||
/*
 * SMN base addresses of the XGMI v6.4.0 MCA register banks.
 * Iterated by the v6.4.0 query/reset paths below, which read and
 * clear the STATUS register at each base on every AID instance.
 */
static const u64 xgmi_v6_4_0_mca_base_array[] = {
	0x11a09200,
	0x11b09200,
};
|
||||
|
||||
/*
 * Human-readable names for the XGMI PCS extended error codes
 * (MCA STATUS ErrorCodeExt field, values 0x00-0x1c). Indexed by
 * xgmi_v6_4_0_pcs_mca_get_error_type() when logging detected errors;
 * unlisted slots stay NULL and are silently skipped.
 */
static const char *xgmi_v6_4_0_ras_error_code_ext[32] = {
	[0x00] = "XGMI PCS DataLossErr",
	[0x01] = "XGMI PCS TrainingErr",
	[0x02] = "XGMI PCS FlowCtrlAckErr",
	[0x03] = "XGMI PCS RxFifoUnderflowErr",
	[0x04] = "XGMI PCS RxFifoOverflowErr",
	[0x05] = "XGMI PCS CRCErr",
	[0x06] = "XGMI PCS BERExceededErr",
	[0x07] = "XGMI PCS TxMetaDataErr",
	[0x08] = "XGMI PCS ReplayBufParityErr",
	[0x09] = "XGMI PCS DataParityErr",
	[0x0a] = "XGMI PCS ReplayFifoOverflowErr",
	[0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
	[0x0c] = "XGMI PCS ElasticFifoOverflowErr",
	[0x0d] = "XGMI PCS DeskewErr",
	[0x0e] = "XGMI PCS FlowCtrlCRCErr",
	[0x0f] = "XGMI PCS DataStartupLimitErr",
	[0x10] = "XGMI PCS FCInitTimeoutErr",
	[0x11] = "XGMI PCS RecoveryTimeoutErr",
	[0x12] = "XGMI PCS ReadySerialTimeoutErr",
	[0x13] = "XGMI PCS ReadySerialAttemptErr",
	[0x14] = "XGMI PCS RecoveryAttemptErr",
	[0x15] = "XGMI PCS RecoveryRelockAttemptErr",
	[0x16] = "XGMI PCS ReplayAttemptErr",
	[0x17] = "XGMI PCS SyncHdrErr",
	[0x18] = "XGMI PCS TxReplayTimeoutErr",
	[0x19] = "XGMI PCS RxReplayTimeoutErr",
	[0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
	[0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
	[0x1c] = "XGMI PCS RxCMDPktErr",
};
|
||||
|
||||
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
|
||||
{"XGMI PCS DataLossErr",
|
||||
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
|
||||
@ -936,7 +973,7 @@ static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg
|
||||
WREG32_PCIE(pcs_status_reg, 0);
|
||||
}
|
||||
|
||||
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
|
||||
static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t i;
|
||||
|
||||
@ -974,6 +1011,39 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
|
||||
}
|
||||
}
|
||||
|
||||
/* Clear one MCA bank on @xgmi_inst by writing 0 to its STATUS register. */
static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
{
	WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
}
|
||||
|
||||
static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
|
||||
__xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
|
||||
}
|
||||
|
||||
static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
|
||||
{
|
||||
int i;
|
||||
|
||||
for_each_inst(i, adev->aid_mask)
|
||||
xgmi_v6_4_0_reset_error_count(adev, i);
|
||||
}
|
||||
|
||||
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
|
||||
{
|
||||
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
|
||||
case IP_VERSION(6, 4, 0):
|
||||
xgmi_v6_4_0_reset_ras_error_count(adev);
|
||||
break;
|
||||
default:
|
||||
amdgpu_xgmi_legacy_reset_ras_error_count(adev);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
|
||||
uint32_t value,
|
||||
uint32_t mask_value,
|
||||
@ -1025,8 +1095,8 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
int i, supported = 1;
|
||||
@ -1121,6 +1191,88 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
err_data->ce_count += ce_cnt;
|
||||
}
|
||||
|
||||
/*
 * Classify an XGMI PCS MCA STATUS value as a correctable or
 * uncorrectable error, logging the decoded extended-error-code name
 * when one is known.
 *
 * Returns AMDGPU_MCA_ERROR_TYPE_UE for ext code 0 (DataLossErr),
 * AMDGPU_MCA_ERROR_TYPE_CE for ext code 6 (BERExceededErr), and
 * -EINVAL for every other code (callers treat that as "not counted").
 */
static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
{
	const char *error_str;
	int ext_error_code;

	ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);

	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	switch (ext_error_code) {
	case 0:
		return AMDGPU_MCA_ERROR_TYPE_UE;
	case 6:
		return AMDGPU_MCA_ERROR_TYPE_CE;
	default:
		/* every path above returns; no trailing dead return needed */
		return -EINVAL;
	}
}
|
||||
|
||||
static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
|
||||
u64 mca_base, struct ras_err_data *err_data)
|
||||
{
|
||||
int xgmi_inst = mcm_info->die_id;
|
||||
u64 status = 0;
|
||||
|
||||
status = RREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS);
|
||||
if (!MCA_REG__STATUS__VAL(status))
|
||||
return;
|
||||
|
||||
switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
|
||||
case AMDGPU_MCA_ERROR_TYPE_UE:
|
||||
amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
|
||||
break;
|
||||
case AMDGPU_MCA_ERROR_TYPE_CE:
|
||||
amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
|
||||
}
|
||||
|
||||
static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
|
||||
{
|
||||
struct amdgpu_smuio_mcm_config_info mcm_info = {
|
||||
.socket_id = adev->smuio.funcs->get_socket_id(adev),
|
||||
.die_id = xgmi_inst,
|
||||
};
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
|
||||
__xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
|
||||
}
|
||||
|
||||
static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
|
||||
{
|
||||
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
|
||||
int i;
|
||||
|
||||
for_each_inst(i, adev->aid_mask)
|
||||
xgmi_v6_4_0_query_error_count(adev, i, err_data);
|
||||
}
|
||||
|
||||
static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status)
|
||||
{
|
||||
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
|
||||
case IP_VERSION(6, 4, 0):
|
||||
xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
|
||||
break;
|
||||
default:
|
||||
amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Trigger XGMI/WAFL error */
|
||||
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
|
||||
void *inject_if, uint32_t instance_mask)
|
||||
|
@ -204,4 +204,10 @@
|
||||
+ adev->asic_funcs->encode_ext_smn_addressing(ext), \
|
||||
value) \
|
||||
|
||||
/*
 * 64-bit read/write of one register in an MCA bank reached over
 * extended SMN addressing. @ext selects the AID/instance, @mca_base is
 * the bank's SMN base, @idx the register index (each register is 8
 * bytes wide, hence the * 8 scaling). Arguments are parenthesized in
 * the expansion so expression arguments (e.g. "i + 1") are scaled and
 * summed correctly.
 */
#define RREG64_MCA(ext, mca_base, idx) \
	RREG64_PCIE_EXT(adev->asic_funcs->encode_ext_smn_addressing(ext) + (mca_base) + ((idx) * 8))

#define WREG64_MCA(ext, mca_base, idx, val) \
	WREG64_PCIE_EXT(adev->asic_funcs->encode_ext_smn_addressing(ext) + (mca_base) + ((idx) * 8), (val))
|
||||
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user