drm/amdgpu: enable watchdog feature for SQ of aldebaran
SQ's watchdog timer monitors forward progress; a mask of which waves caused the watchdog timeout is recorded into the RAS status registers, and then a system fatal error event is triggered. v2: 1. change *query_timeout_status to *query_sq_timeout_status. 2. move query_sq_timeout_status into amdgpu_ras_do_recovery. 3. add module parameters to enable/disable the fatal error event and to modify the watchdog timer period. v3: 1. remove unused parameters of *enable_watchdog_timer. Signed-off-by: Dennis Li <Dennis.Li@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
4abc2567f0
commit
88f8575bca
@ -126,6 +126,12 @@ struct amdgpu_mgpu_info
|
||||
uint32_t num_apu;
|
||||
};
|
||||
|
||||
/*
 * Tunables for the GFX SQ watchdog timer; exposed as module
 * parameters (timeout_fatal_disable, timeout_period).
 */
struct amdgpu_watchdog_timer {
	/* true = a watchdog timeout does NOT raise a fatal error event */
	bool timeout_fatal_disable;
	/* period select: maxCycles = (1 << period) cycles before a timeout */
	uint32_t period;
};

#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
|
||||
|
||||
/*
|
||||
@ -187,6 +193,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
|
||||
extern int amdgpu_ras_enable;
|
||||
extern uint amdgpu_ras_mask;
|
||||
extern int amdgpu_bad_page_threshold;
|
||||
extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
|
||||
extern int amdgpu_async_gfx_ring;
|
||||
extern int amdgpu_mcbp;
|
||||
extern int amdgpu_discovery;
|
||||
|
@ -175,6 +175,10 @@ struct amdgpu_mgpu_info mgpu_info = {
|
||||
int amdgpu_ras_enable = -1;
|
||||
uint amdgpu_ras_mask = 0xffffffff;
|
||||
int amdgpu_bad_page_threshold = 100;
|
||||
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
|
||||
.timeout_fatal_disable = false,
|
||||
.period = 0x3f, /* about 8s */
|
||||
};
|
||||
|
||||
/**
|
||||
* DOC: vramlimit (int)
|
||||
@ -530,6 +534,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
|
||||
MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1");
|
||||
module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444);
|
||||
|
||||
/**
|
||||
* DOC: timeout_fatal_disable (bool)
|
||||
* Disable Watchdog timeout fatal error event
|
||||
*/
|
||||
MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)");
|
||||
module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644);
|
||||
|
||||
/**
|
||||
* DOC: timeout_period (uint)
|
||||
* Modify the watchdog timeout max_cycles as (1 << period)
|
||||
*/
|
||||
MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)");
|
||||
module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644);
|
||||
|
||||
/**
|
||||
* DOC: si_support (int)
|
||||
* Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled,
|
||||
|
@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs {
|
||||
void (*init_spm_golden)(struct amdgpu_device *adev);
|
||||
void (*query_ras_error_status) (struct amdgpu_device *adev);
|
||||
void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable);
|
||||
void (*enable_watchdog_timer)(struct amdgpu_device *adev);
|
||||
void (*query_sq_timeout_status)(struct amdgpu_device *adev);
|
||||
};
|
||||
|
||||
struct sq_work {
|
||||
|
@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
|
||||
case AMDGPU_RAS_BLOCK__GFX:
|
||||
if (adev->gfx.funcs->query_ras_error_status)
|
||||
adev->gfx.funcs->query_ras_error_status(adev);
|
||||
|
||||
if (adev->gfx.funcs->query_sq_timeout_status)
|
||||
adev->gfx.funcs->query_sq_timeout_status(adev);
|
||||
break;
|
||||
case AMDGPU_RAS_BLOCK__MMHUB:
|
||||
if (adev->mmhub.funcs->query_ras_error_status)
|
||||
|
@ -2124,6 +2124,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = {
|
||||
.query_ras_error_count = &gfx_v9_4_2_query_ras_error_count,
|
||||
.reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count,
|
||||
.query_ras_error_status = &gfx_v9_4_2_query_ras_error_status,
|
||||
.enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer,
|
||||
.query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status,
|
||||
};
|
||||
|
||||
static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev)
|
||||
@ -3968,6 +3970,9 @@ static int gfx_v9_0_hw_init(void *handle)
|
||||
if (adev->asic_type == CHIP_ALDEBARAN)
|
||||
gfx_v9_4_2_set_power_brake_sequence(adev);
|
||||
|
||||
if (adev->gfx.funcs->enable_watchdog_timer)
|
||||
adev->gfx.funcs->enable_watchdog_timer(adev);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev)
|
||||
gfx_v9_4_2_query_ea_err_status(adev);
|
||||
gfx_v9_4_2_query_utc_err_status(adev);
|
||||
}
|
||||
|
||||
void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t i;
|
||||
uint32_t data;
|
||||
|
||||
data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE,
|
||||
amdgpu_watchdog_timer.timeout_fatal_disable ? 1 :
|
||||
0);
|
||||
data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL,
|
||||
amdgpu_watchdog_timer.period);
|
||||
|
||||
mutex_lock(&adev->grbm_idx_mutex);
|
||||
for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
|
||||
gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff);
|
||||
WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data);
|
||||
}
|
||||
gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
}
|
||||
|
||||
/*
 * Read one wave-indexed SQ register via the SQ_IND_INDEX/SQ_IND_DATA
 * pair. Caller must already have the target SE/SH/CU selected via GRBM.
 */
static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd,
			      uint32_t wave, uint32_t address)
{
	uint32_t ind_index = (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) |
			     (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) |
			     (address << SQ_IND_INDEX__INDEX__SHIFT) |
			     (SQ_IND_INDEX__FORCE_READ_MASK);

	WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX, ind_index);
	return RREG32_SOC15(GC, 0, regSQ_IND_DATA);
}
|
||||
|
||||
static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev,
|
||||
uint32_t status)
|
||||
{
|
||||
struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
|
||||
uint32_t i, simd, wave;
|
||||
uint32_t wave_status;
|
||||
uint32_t wave_pc_lo, wave_pc_hi;
|
||||
uint32_t wave_exec_lo, wave_exec_hi;
|
||||
uint32_t wave_inst_dw0, wave_inst_dw1;
|
||||
uint32_t wave_ib_sts;
|
||||
|
||||
for (i = 0; i < 32; i++) {
|
||||
if (!((i << 1) & status))
|
||||
continue;
|
||||
|
||||
simd = i / cu_info->max_waves_per_simd;
|
||||
wave = i % cu_info->max_waves_per_simd;
|
||||
|
||||
wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS);
|
||||
wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO);
|
||||
wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI);
|
||||
wave_exec_lo =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO);
|
||||
wave_exec_hi =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI);
|
||||
wave_inst_dw0 =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0);
|
||||
wave_inst_dw1 =
|
||||
wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1);
|
||||
wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS);
|
||||
|
||||
dev_info(
|
||||
adev->dev,
|
||||
"\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n",
|
||||
simd, wave, wave_status,
|
||||
((uint64_t)wave_pc_hi << 32 | wave_pc_lo),
|
||||
((uint64_t)wave_exec_hi << 32 | wave_exec_lo),
|
||||
((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0),
|
||||
wave_ib_sts);
|
||||
}
|
||||
}
|
||||
|
||||
void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev)
|
||||
{
|
||||
uint32_t se_idx, sh_idx, cu_idx;
|
||||
uint32_t status;
|
||||
|
||||
mutex_lock(&adev->grbm_idx_mutex);
|
||||
for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines;
|
||||
se_idx++) {
|
||||
for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se;
|
||||
sh_idx++) {
|
||||
for (cu_idx = 0;
|
||||
cu_idx < adev->gfx.config.max_cu_per_sh;
|
||||
cu_idx++) {
|
||||
gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx,
|
||||
cu_idx);
|
||||
status = RREG32_SOC15(GC, 0,
|
||||
regSQ_TIMEOUT_STATUS);
|
||||
if (status != 0) {
|
||||
dev_info(
|
||||
adev->dev,
|
||||
"GFX Watchdog Timeout: SE %d, SH %d, CU %d\n",
|
||||
se_idx, sh_idx, cu_idx);
|
||||
gfx_v9_4_2_log_cu_timeout_status(
|
||||
adev, status);
|
||||
}
|
||||
/* clear old status */
|
||||
WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
|
||||
mutex_unlock(&adev->grbm_idx_mutex);
|
||||
}
|
@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if);
|
||||
void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev);
|
||||
int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev,
|
||||
void *ras_error_status);
|
||||
|
||||
void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev);
|
||||
void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev);
|
||||
#endif /* __GFX_V9_4_2_H__ */
|
||||
|
@ -100,6 +100,30 @@
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
/*
 * WREG32_RLC_EX - write a GC register, routing the write through the RLC
 * when running as an SR-IOV full-access guest.
 *
 * Under amdgpu_sriov_fullaccess(): the value is placed in SCRATCH_REG0,
 * the register offset (with bit 31 set as a "request pending" flag) in
 * SCRATCH_REG1, and RLC_SPARE_INT is written to kick the RLC firmware.
 * The macro then polls SCRATCH_REG1 up to 50000 times (10 us apart)
 * waiting for the RLC to clear bit 31, and logs an error on timeout.
 * Otherwise it falls back to a plain WREG32.
 *
 * @prefix selects the register-name prefix used to build the scratch and
 * spare-interrupt offsets via token pasting (e.g. mm vs reg naming).
 */
#define WREG32_RLC_EX(prefix, reg, value) \
	do { \
		if (amdgpu_sriov_fullaccess(adev)) { \
			uint32_t i = 0; \
			uint32_t retries = 50000; \
			uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0; \
			uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1; \
			uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT; \
			WREG32(r0, value); \
			WREG32(r1, (reg | 0x80000000)); \
			WREG32(spare_int, 0x1); \
			for (i = 0; i < retries; i++) { \
				u32 tmp = RREG32(r1); \
				if (!(tmp & 0x80000000)) \
					break; \
				udelay(10); \
			} \
			if (i >= retries) \
				pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg); \
		} else { \
			WREG32(reg, value); \
		} \
	} while (0)
|
||||
|
||||
#define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \
|
||||
do { \
|
||||
uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\
|
||||
@ -142,6 +166,12 @@
|
||||
WREG32_RLC(target_reg, value); \
|
||||
} while (0)
|
||||
|
||||
/*
 * WREG32_SOC15_RLC_EX - resolve a SOC15 register name to its absolute
 * offset and write it through WREG32_RLC_EX.
 * NOTE(review): the ip/inst parameters are accepted but the lookup is
 * hard-coded to GC_HWIP instance 0 — consistent with WREG32_RLC_EX,
 * which is GC-only; confirm before reusing for another IP block.
 */
#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \
	do { \
		uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\
		WREG32_RLC_EX(prefix, target_reg, value); \
	} while (0)
|
||||
|
||||
#define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \
|
||||
WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \
|
||||
(RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \
|
||||
|
Loading…
x
Reference in New Issue
Block a user