drm/amdgpu: Add driver infrastructure for MCA RAS

Add MCA specific IP blocks targetting RAS features

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
John Clements 2021-08-24 13:24:25 +08:00 committed by Alex Deucher
parent 3341d30d1c
commit 3907c49218
9 changed files with 388 additions and 2 deletions

View File

@ -58,7 +58,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_hdp.o \
amdgpu_eeprom.o
amdgpu_eeprom.o amdgpu_mca.o
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
@ -189,6 +189,10 @@ amdgpu-y += \
amdgpu-y += \
amdgpu_reset.o
# add MCA block
amdgpu-y += \
mca_v3_0.o
# add amdkfd interfaces
amdgpu-y += amdgpu_amdkfd.o

View File

@ -108,6 +108,7 @@
#include "amdgpu_df.h"
#include "amdgpu_smuio.h"
#include "amdgpu_fdinfo.h"
#include "amdgpu_mca.h"
#define MAX_GPU_INSTANCE 16
@ -1009,6 +1010,9 @@ struct amdgpu_device {
/* df */
struct amdgpu_df df;
/* MCA */
struct amdgpu_mca mca;
struct amdgpu_ip_block ip_blocks[AMDGPU_MAX_IP_NUM];
uint32_t harvest_ip_mask;
int num_ip_blocks;

View File

@ -471,6 +471,27 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
return r;
}
if (adev->mca.mp0.ras_funcs &&
adev->mca.mp0.ras_funcs->ras_late_init) {
r = adev->mca.mp0.ras_funcs->ras_late_init(adev);
if (r)
return r;
}
if (adev->mca.mp1.ras_funcs &&
adev->mca.mp1.ras_funcs->ras_late_init) {
r = adev->mca.mp1.ras_funcs->ras_late_init(adev);
if (r)
return r;
}
if (adev->mca.mpio.ras_funcs &&
adev->mca.mpio.ras_funcs->ras_late_init) {
r = adev->mca.mpio.ras_funcs->ras_late_init(adev);
if (r)
return r;
}
return 0;
}

View File

@ -0,0 +1,117 @@
/*
* Copyright 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "amdgpu_ras.h"
#include "amdgpu.h"
#include "amdgpu_mca.h"
#include "umc/umc_6_7_0_offset.h"
#include "umc/umc_6_7_0_sh_mask.h"
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count)
{
uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
*error_count += 1;
}
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count)
{
uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
(REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
*error_count += 1;
}
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr)
{
WREG64_PCIE(mc_status_addr * 4, 0x0ULL);
}
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
amdgpu_mca_query_correctable_error_count(adev, mc_status_addr, &(err_data->ce_count));
amdgpu_mca_query_uncorrectable_error_count(adev, mc_status_addr, &(err_data->ue_count));
amdgpu_mca_reset_error_count(adev, mc_status_addr);
}
int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
struct amdgpu_mca_ras *mca_dev)
{
int r;
struct ras_ih_if ih_info = {
.cb = NULL,
};
struct ras_fs_if fs_info = {
.sysfs_name = mca_dev->ras_funcs->sysfs_name,
};
if (!mca_dev->ras_if) {
mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
if (!mca_dev->ras_if)
return -ENOMEM;
mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
mca_dev->ras_if->sub_block_index = 0;
}
ih_info.head = fs_info.head = *mca_dev->ras_if;
r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
&fs_info, &ih_info);
if (r || !amdgpu_ras_is_supported(adev, mca_dev->ras_if->block)) {
kfree(mca_dev->ras_if);
mca_dev->ras_if = NULL;
}
return r;
}
void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
struct amdgpu_mca_ras *mca_dev)
{
struct ras_ih_if ih_info = {
.cb = NULL,
};
if (!mca_dev->ras_if)
return;
amdgpu_ras_late_fini(adev, mca_dev->ras_if, &ih_info);
kfree(mca_dev->ras_if);
mca_dev->ras_if = NULL;
}

View File

@ -0,0 +1,72 @@
/*
* Copyright (C) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef __AMDGPU_MCA_H__
#define __AMDGPU_MCA_H__
struct amdgpu_mca_ras_funcs {
int (*ras_late_init)(struct amdgpu_device *adev);
void (*ras_fini)(struct amdgpu_device *adev);
void (*query_ras_error_count)(struct amdgpu_device *adev,
void *ras_error_status);
void (*query_ras_error_address)(struct amdgpu_device *adev,
void *ras_error_status);
uint32_t ras_block;
const char* sysfs_name;
};
struct amdgpu_mca_ras {
struct ras_common_if *ras_if;
const struct amdgpu_mca_ras_funcs *ras_funcs;
};
struct amdgpu_mca_funcs {
void (*init)(struct amdgpu_device *adev);
};
struct amdgpu_mca {
const struct amdgpu_mca_funcs *funcs;
struct amdgpu_mca_ras mp0;
struct amdgpu_mca_ras mp1;
struct amdgpu_mca_ras mpio;
};
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count);
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
unsigned long *error_count);
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr);
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
uint64_t mc_status_addr,
void *ras_error_status);
int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
struct amdgpu_mca_ras *mca_dev);
void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
struct amdgpu_mca_ras *mca_dev);
#endif

View File

@ -49,6 +49,7 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MP0,
AMDGPU_RAS_BLOCK__MP1,
AMDGPU_RAS_BLOCK__FUSE,
AMDGPU_RAS_BLOCK__MPIO,
AMDGPU_RAS_BLOCK__LAST
};
@ -420,7 +421,7 @@ struct ras_badpage {
/* interfaces for IP */
struct ras_fs_if {
struct ras_common_if head;
char sysfs_name[32];
const char* sysfs_name;
char debugfs_name[32];
};

View File

@ -55,6 +55,7 @@
#include "umc_v6_0.h"
#include "umc_v6_7.h"
#include "hdp_v4_0.h"
#include "mca_v3_0.h"
#include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
@ -1229,6 +1230,18 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct amdgpu_device *adev)
adev->hdp.ras_funcs = &hdp_v4_0_ras_funcs;
}
static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
{
switch (adev->asic_type) {
case CHIP_ALDEBARAN:
if (!adev->gmc.xgmi.connected_to_cpu)
adev->mca.funcs = &mca_v3_0_funcs;
break;
default:
break;
}
}
static int gmc_v9_0_early_init(void *handle)
{
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@ -1250,6 +1263,7 @@ static int gmc_v9_0_early_init(void *handle)
gmc_v9_0_set_mmhub_ras_funcs(adev);
gmc_v9_0_set_gfxhub_funcs(adev);
gmc_v9_0_set_hdp_ras_funcs(adev);
gmc_v9_0_set_mca_funcs(adev);
adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
adev->gmc.shared_aperture_end =
@ -1461,6 +1475,8 @@ static int gmc_v9_0_sw_init(void *handle)
adev->gfxhub.funcs->init(adev);
adev->mmhub.funcs->init(adev);
if (adev->mca.funcs)
adev->mca.funcs->init(adev);
spin_lock_init(&adev->gmc.invalidate_lock);

View File

@ -0,0 +1,125 @@
/*
* Copyright 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "amdgpu_ras.h"
#include "amdgpu.h"
#include "amdgpu_mca.h"
#define smnMCMP0_STATUST0 0x03830408
#define smnMCMP1_STATUST0 0x03b30408
#define smnMCMPIO_STATUST0 0x0c930408
static void mca_v3_0_mp0_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
amdgpu_mca_query_ras_error_count(adev,
smnMCMP0_STATUST0,
ras_error_status);
}
static int mca_v3_0_mp0_ras_late_init(struct amdgpu_device *adev)
{
return amdgpu_mca_ras_late_init(adev, &adev->mca.mp0);
}
static void mca_v3_0_mp0_ras_fini(struct amdgpu_device *adev)
{
amdgpu_mca_ras_fini(adev, &adev->mca.mp0);
}
const struct amdgpu_mca_ras_funcs mca_v3_0_mp0_ras_funcs = {
.ras_late_init = mca_v3_0_mp0_ras_late_init,
.ras_fini = mca_v3_0_mp0_ras_fini,
.query_ras_error_count = mca_v3_0_mp0_query_ras_error_count,
.query_ras_error_address = NULL,
.ras_block = AMDGPU_RAS_BLOCK__MP0,
.sysfs_name = "mp0_err_count",
};
static void mca_v3_0_mp1_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
amdgpu_mca_query_ras_error_count(adev,
smnMCMP1_STATUST0,
ras_error_status);
}
static int mca_v3_0_mp1_ras_late_init(struct amdgpu_device *adev)
{
return amdgpu_mca_ras_late_init(adev, &adev->mca.mp1);
}
static void mca_v3_0_mp1_ras_fini(struct amdgpu_device *adev)
{
amdgpu_mca_ras_fini(adev, &adev->mca.mp1);
}
const struct amdgpu_mca_ras_funcs mca_v3_0_mp1_ras_funcs = {
.ras_late_init = mca_v3_0_mp1_ras_late_init,
.ras_fini = mca_v3_0_mp1_ras_fini,
.query_ras_error_count = mca_v3_0_mp1_query_ras_error_count,
.query_ras_error_address = NULL,
.ras_block = AMDGPU_RAS_BLOCK__MP1,
.sysfs_name = "mp1_err_count",
};
static void mca_v3_0_mpio_query_ras_error_count(struct amdgpu_device *adev,
void *ras_error_status)
{
amdgpu_mca_query_ras_error_count(adev,
smnMCMPIO_STATUST0,
ras_error_status);
}
static int mca_v3_0_mpio_ras_late_init(struct amdgpu_device *adev)
{
return amdgpu_mca_ras_late_init(adev, &adev->mca.mpio);
}
static void mca_v3_0_mpio_ras_fini(struct amdgpu_device *adev)
{
amdgpu_mca_ras_fini(adev, &adev->mca.mpio);
}
const struct amdgpu_mca_ras_funcs mca_v3_0_mpio_ras_funcs = {
.ras_late_init = mca_v3_0_mpio_ras_late_init,
.ras_fini = mca_v3_0_mpio_ras_fini,
.query_ras_error_count = mca_v3_0_mpio_query_ras_error_count,
.query_ras_error_address = NULL,
.ras_block = AMDGPU_RAS_BLOCK__MPIO,
.sysfs_name = "mpio_err_count",
};
static void mca_v3_0_init(struct amdgpu_device *adev)
{
struct amdgpu_mca *mca = &adev->mca;
mca->mp0.ras_funcs = &mca_v3_0_mp0_ras_funcs;
mca->mp1.ras_funcs = &mca_v3_0_mp1_ras_funcs;
mca->mpio.ras_funcs = &mca_v3_0_mpio_ras_funcs;
}
const struct amdgpu_mca_funcs mca_v3_0_funcs = {
.init = mca_v3_0_init,
};

View File

@ -0,0 +1,26 @@
/*
* Copyright (C) 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef __MCA_V3_0_H__
#define __MCA_V3_0_H__
extern const struct amdgpu_mca_funcs mca_v3_0_funcs;
#endif