drm/amdgpu: umc v12_0 logs ecc errors
1. umc v12_0 logs ecc errors. 2. Reserve newly detected ecc error pages. 3. Add tag for bad pages, so that they can be retired later. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
b2aa6b108d
commit
f27defca68
@ -21,10 +21,13 @@
|
||||
*
|
||||
*/
|
||||
|
||||
#include <linux/sort.h>
|
||||
#include "amdgpu.h"
|
||||
#include "umc_v6_7.h"
|
||||
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
|
||||
|
||||
#define MAX_UMC_HASH_STRING_SIZE 256
|
||||
|
||||
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
|
||||
struct ras_err_data *err_data, uint64_t err_addr,
|
||||
uint32_t ch_inst, uint32_t umc_inst)
|
||||
@ -446,3 +449,67 @@ int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
|
||||
status, ipid, addr);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Three-way comparison callback for sorting arrays of uint64_t values
 * in ascending order.
 *
 * @a, @b: pointers to the two uint64_t elements being compared.
 *
 * Returns 1 if *a > *b, -1 if *a < *b, and 0 if they are equal.
 */
static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
{
	const uint64_t lhs = *(const uint64_t *)a;
	const uint64_t rhs = *(const uint64_t *)b;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
/* Use string hash to avoid logging the same bad pages repeatedly */
|
||||
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
|
||||
uint64_t *pfns, int len, uint64_t *val)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
|
||||
int offset = 0, i = 0;
|
||||
uint64_t hash_val;
|
||||
|
||||
if (!pfns || !len)
|
||||
return -EINVAL;
|
||||
|
||||
sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
|
||||
|
||||
for (i = 0; i < len; i++)
|
||||
offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
|
||||
|
||||
hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
|
||||
|
||||
*val = hash_val;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
|
||||
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_ecc_log_info *ecc_log;
|
||||
int ret;
|
||||
|
||||
ecc_log = &con->umc_ecc_log;
|
||||
|
||||
mutex_lock(&ecc_log->lock);
|
||||
ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
|
||||
if (!ret) {
|
||||
struct ras_err_pages *err_pages = &ecc_err->err_pages;
|
||||
int i;
|
||||
|
||||
/* Reserve memory */
|
||||
for (i = 0; i < err_pages->count; i++)
|
||||
amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
|
||||
|
||||
radix_tree_tag_set(ecc_tree,
|
||||
ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
|
||||
}
|
||||
mutex_unlock(&ecc_log->lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -52,6 +52,8 @@
|
||||
#define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
|
||||
LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
|
||||
|
||||
/*
 * Page retirement tag: radix-tree tag set on an ECC error entry right after
 * it is newly inserted and its pages are reserved, marking it as newly
 * detected so that its bad pages can be retired later.
 */
|
||||
#define UMC_ECC_NEW_DETECTED_TAG 0x1
|
||||
|
||||
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
|
||||
uint32_t umc_inst, uint32_t ch_inst, void *data);
|
||||
@ -127,5 +129,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
|
||||
|
||||
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
|
||||
uint64_t status, uint64_t ipid, uint64_t addr);
|
||||
|
||||
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
|
||||
uint64_t *pfns, int len, uint64_t *val);
|
||||
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
|
||||
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
|
||||
#endif
|
||||
|
@ -546,8 +546,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
|
||||
uint16_t hwid, mcatype;
|
||||
struct ta_ras_query_address_input addr_in;
|
||||
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
|
||||
uint64_t err_addr;
|
||||
uint64_t err_addr, hash_val = 0;
|
||||
struct ras_ecc_err *ecc_err;
|
||||
int count;
|
||||
int ret;
|
||||
|
||||
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
|
||||
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
|
||||
@ -589,6 +591,43 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
|
||||
return 0;
|
||||
}
|
||||
|
||||
ret = amdgpu_umc_build_pages_hash(adev,
|
||||
page_pfn, count, &hash_val);
|
||||
if (ret) {
|
||||
dev_err(adev->dev, "Fail to build error pages hash\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
|
||||
if (!ecc_err)
|
||||
return -ENOMEM;
|
||||
|
||||
ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
|
||||
if (!ecc_err->err_pages.pfn) {
|
||||
kfree(ecc_err);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
|
||||
ecc_err->err_pages.count = count;
|
||||
|
||||
ecc_err->hash_index = hash_val;
|
||||
ecc_err->status = status;
|
||||
ecc_err->ipid = ipid;
|
||||
ecc_err->addr = addr;
|
||||
|
||||
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
|
||||
if (ret) {
|
||||
if (ret == -EEXIST)
|
||||
con->umc_ecc_log.de_updated = true;
|
||||
else
|
||||
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
|
||||
|
||||
kfree(ecc_err->err_pages.pfn);
|
||||
kfree(ecc_err);
|
||||
return ret;
|
||||
}
|
||||
|
||||
con->umc_ecc_log.de_updated = true;
|
||||
|
||||
return 0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user