drm/amdgpu: enable ras on gfx9 (v2)
Register ecc interrupts and ecc interrupt handler on gfx9. Add ras support on gfx9 v2: squash in warning fix Signed-off-by: Feifei Xu <Feifei.Xu@amd.com> Signed-off-by: xinhui pan <xinhui.pan@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
8cf12507d3
commit
760a1d5534
@ -258,6 +258,9 @@ struct amdgpu_gfx {
|
||||
/* pipe reservation */
|
||||
struct mutex pipe_reserve_mutex;
|
||||
DECLARE_BITMAP (pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
|
||||
|
||||
/*ras */
|
||||
struct ras_common_if *ras_if;
|
||||
};
|
||||
|
||||
#define amdgpu_gfx_get_gpu_clock_counter(adev) (adev)->gfx.funcs->get_gpu_clock_counter((adev))
|
||||
|
@ -40,6 +40,8 @@
|
||||
|
||||
#include "ivsrcid/gfx/irqsrcs_gfx_9_0.h"
|
||||
|
||||
#include "amdgpu_ras.h"
|
||||
|
||||
#define GFX9_NUM_GFX_RINGS 1
|
||||
#define GFX9_MEC_HPD_SIZE 4096
|
||||
#define RLCG_UCODE_LOADING_START_ADDRESS 0x00002000L
|
||||
@ -1638,6 +1640,18 @@ static int gfx_v9_0_sw_init(void *handle)
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
/* ECC error */
|
||||
r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_ECC_ERROR,
|
||||
&adev->gfx.cp_ecc_error_irq);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
/* FUE error */
|
||||
r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_FUE_ERROR,
|
||||
&adev->gfx.cp_ecc_error_irq);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE;
|
||||
|
||||
gfx_v9_0_scratch_init(adev);
|
||||
@ -1730,6 +1744,20 @@ static int gfx_v9_0_sw_fini(void *handle)
|
||||
int i;
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
|
||||
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX) &&
|
||||
adev->gfx.ras_if) {
|
||||
struct ras_common_if *ras_if = adev->gfx.ras_if;
|
||||
struct ras_ih_if ih_info = {
|
||||
.head = *ras_if,
|
||||
};
|
||||
|
||||
amdgpu_ras_debugfs_remove(adev, ras_if);
|
||||
amdgpu_ras_sysfs_remove(adev, ras_if);
|
||||
amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
|
||||
amdgpu_ras_feature_enable(adev, ras_if, 0);
|
||||
kfree(ras_if);
|
||||
}
|
||||
|
||||
amdgpu_bo_free_kernel(&adev->gds.oa_gfx_bo, NULL, NULL);
|
||||
amdgpu_bo_free_kernel(&adev->gds.gws_gfx_bo, NULL, NULL);
|
||||
amdgpu_bo_free_kernel(&adev->gds.gds_gfx_bo, NULL, NULL);
|
||||
@ -3304,6 +3332,7 @@ static int gfx_v9_0_hw_fini(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
|
||||
amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
|
||||
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
|
||||
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
|
||||
|
||||
@ -3493,6 +3522,77 @@ static int gfx_v9_0_early_init(void *handle)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
struct amdgpu_iv_entry *entry);
|
||||
|
||||
static int gfx_v9_0_ecc_late_init(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
struct ras_common_if **ras_if = &adev->gfx.ras_if;
|
||||
struct ras_ih_if ih_info = {
|
||||
.cb = gfx_v9_0_process_ras_data_cb,
|
||||
};
|
||||
struct ras_fs_if fs_info = {
|
||||
.sysfs_name = "gfx_err_count",
|
||||
.debugfs_name = "gfx_err_inject",
|
||||
};
|
||||
struct ras_common_if ras_block = {
|
||||
.block = AMDGPU_RAS_BLOCK__GFX,
|
||||
.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
|
||||
.sub_block_index = 0,
|
||||
.name = "gfx",
|
||||
};
|
||||
int r;
|
||||
|
||||
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
|
||||
amdgpu_ras_feature_enable(adev, &ras_block, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
*ras_if = kmalloc(sizeof(**ras_if), GFP_KERNEL);
|
||||
if (!*ras_if)
|
||||
return -ENOMEM;
|
||||
|
||||
**ras_if = ras_block;
|
||||
|
||||
r = amdgpu_ras_feature_enable(adev, *ras_if, 1);
|
||||
if (r)
|
||||
goto feature;
|
||||
|
||||
ih_info.head = **ras_if;
|
||||
fs_info.head = **ras_if;
|
||||
|
||||
r = amdgpu_ras_interrupt_add_handler(adev, &ih_info);
|
||||
if (r)
|
||||
goto interrupt;
|
||||
|
||||
r = amdgpu_ras_debugfs_create(adev, &fs_info);
|
||||
if (r)
|
||||
goto debugfs;
|
||||
|
||||
r = amdgpu_ras_sysfs_create(adev, &fs_info);
|
||||
if (r)
|
||||
goto sysfs;
|
||||
|
||||
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
|
||||
if (r)
|
||||
goto irq;
|
||||
|
||||
return 0;
|
||||
irq:
|
||||
amdgpu_ras_sysfs_remove(adev, *ras_if);
|
||||
sysfs:
|
||||
amdgpu_ras_debugfs_remove(adev, *ras_if);
|
||||
debugfs:
|
||||
amdgpu_ras_interrupt_remove_handler(adev, &ih_info);
|
||||
interrupt:
|
||||
amdgpu_ras_feature_enable(adev, *ras_if, 0);
|
||||
feature:
|
||||
kfree(*ras_if);
|
||||
*ras_if = NULL;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static int gfx_v9_0_late_init(void *handle)
|
||||
{
|
||||
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
|
||||
@ -3506,6 +3606,10 @@ static int gfx_v9_0_late_init(void *handle)
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
r = gfx_v9_0_ecc_late_init(handle);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -4542,6 +4646,45 @@ static int gfx_v9_0_set_priv_inst_fault_state(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define ENABLE_ECC_ON_ME_PIPE(me, pipe) \
|
||||
WREG32_FIELD15(GC, 0, CP_ME##me##_PIPE##pipe##_INT_CNTL,\
|
||||
CP_ECC_ERROR_INT_ENABLE, 1)
|
||||
|
||||
#define DISABLE_ECC_ON_ME_PIPE(me, pipe) \
|
||||
WREG32_FIELD15(GC, 0, CP_ME##me##_PIPE##pipe##_INT_CNTL,\
|
||||
CP_ECC_ERROR_INT_ENABLE, 0)
|
||||
|
||||
static int gfx_v9_0_set_cp_ecc_error_state(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
unsigned type,
|
||||
enum amdgpu_interrupt_state state)
|
||||
{
|
||||
switch (state) {
|
||||
case AMDGPU_IRQ_STATE_DISABLE:
|
||||
WREG32_FIELD15(GC, 0, CP_INT_CNTL_RING0,
|
||||
CP_ECC_ERROR_INT_ENABLE, 0);
|
||||
DISABLE_ECC_ON_ME_PIPE(1, 0);
|
||||
DISABLE_ECC_ON_ME_PIPE(1, 1);
|
||||
DISABLE_ECC_ON_ME_PIPE(1, 2);
|
||||
DISABLE_ECC_ON_ME_PIPE(1, 3);
|
||||
break;
|
||||
|
||||
case AMDGPU_IRQ_STATE_ENABLE:
|
||||
WREG32_FIELD15(GC, 0, CP_INT_CNTL_RING0,
|
||||
CP_ECC_ERROR_INT_ENABLE, 1);
|
||||
ENABLE_ECC_ON_ME_PIPE(1, 0);
|
||||
ENABLE_ECC_ON_ME_PIPE(1, 1);
|
||||
ENABLE_ECC_ON_ME_PIPE(1, 2);
|
||||
ENABLE_ECC_ON_ME_PIPE(1, 3);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static int gfx_v9_0_set_eop_interrupt_state(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *src,
|
||||
unsigned type,
|
||||
@ -4658,6 +4801,27 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
/* TODO ue will trigger an interrupt. */
|
||||
amdgpu_ras_reset_gpu(adev, 0);
|
||||
return AMDGPU_RAS_UE;
|
||||
}
|
||||
|
||||
static int gfx_v9_0_cp_ecc_error_irq(struct amdgpu_device *adev,
|
||||
struct amdgpu_irq_src *source,
|
||||
struct amdgpu_iv_entry *entry)
|
||||
{
|
||||
struct ras_dispatch_if ih_data = {
|
||||
.head = *adev->gfx.ras_if,
|
||||
.entry = entry,
|
||||
};
|
||||
DRM_ERROR("CP ECC ERROR IRQ\n");
|
||||
amdgpu_ras_interrupt_dispatch(adev, &ih_data);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct amd_ip_funcs gfx_v9_0_ip_funcs = {
|
||||
.name = "gfx_v9_0",
|
||||
.early_init = gfx_v9_0_early_init,
|
||||
@ -4819,6 +4983,12 @@ static const struct amdgpu_irq_src_funcs gfx_v9_0_priv_inst_irq_funcs = {
|
||||
.process = gfx_v9_0_priv_inst_irq,
|
||||
};
|
||||
|
||||
static const struct amdgpu_irq_src_funcs gfx_v9_0_cp_ecc_error_irq_funcs = {
|
||||
.set = gfx_v9_0_set_cp_ecc_error_state,
|
||||
.process = gfx_v9_0_cp_ecc_error_irq,
|
||||
};
|
||||
|
||||
|
||||
static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev)
|
||||
{
|
||||
adev->gfx.eop_irq.num_types = AMDGPU_CP_IRQ_LAST;
|
||||
@ -4829,6 +4999,9 @@ static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev)
|
||||
|
||||
adev->gfx.priv_inst_irq.num_types = 1;
|
||||
adev->gfx.priv_inst_irq.funcs = &gfx_v9_0_priv_inst_irq_funcs;
|
||||
|
||||
adev->gfx.cp_ecc_error_irq.num_types = 2; /*C5 ECC error and C9 FUE error*/
|
||||
adev->gfx.cp_ecc_error_irq.funcs = &gfx_v9_0_cp_ecc_error_irq_funcs;
|
||||
}
|
||||
|
||||
static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev)
|
||||
|
Loading…
x
Reference in New Issue
Block a user